diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index bd597344ea..af36f492ba 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @shumway @vidyasagar-amd
+* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
 # Documentation files
-docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
+docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd
+library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
diff --git a/.github/scripts/therock_configure_ci.py b/.github/scripts/therock_configure_ci.py
index 557afe2d84..cc66fdbfe8 100644
--- a/.github/scripts/therock_configure_ci.py
+++ b/.github/scripts/therock_configure_ci.py
@@ -42,6 +42,24 @@ def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
             file=sys.stderr,
         )
         return None
+    
+GITHUB_WORKFLOWS_CI_PATTERNS = [
+    "therock*",
+]
+
+def is_path_workflow_file_related_to_ci(path: str) -> bool:
+    return any(
+        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    ) or any(
+        fnmatch.fnmatch(path, ".github/scripts/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    )
+
+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
+    if paths is None:
+        return False
+    return any(is_path_workflow_file_related_to_ci(p) for p in paths)
 
 # Paths matching any of these patterns are considered to have no influence over
 # build or test workflows so any related jobs can be skipped if all paths
@@ -82,12 +100,16 @@ def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
     )
     other_paths = paths_set - github_workflows_paths
 
+    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
     contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
 
     print("should_ci_run_given_modified_paths findings:")
     print(f"  contains_other_non_skippable_files: {contains_other_non_skippable_files}")
 
-    if contains_other_non_skippable_files:
+    if related_to_ci:
+        print("Enabling build jobs since a related workflow file was modified")
+        return True
+    elif contains_other_non_skippable_files:
         print("Enabling TheRock CI jobs since a non-skippable path was modified")
         return True
     else:
diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml
index 7db124d2a1..695fb1d913 100644
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -27,30 +27,35 @@ jobs:
       TEATIME_FORCE_INTERACTIVE: 0
       AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
     steps:
+      - name: "Checking out repository for rocm-libraries"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rocm-libraries"
+
       - name: Checkout composable_kernel repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: "composable_kernel"
 
       - name: Checkout TheRock repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "ROCm/TheRock"
-          ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
+          ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200
           path: "TheRock"
 
       - name: Runner Health Settings
         run: |
-          df -h
-          cmake --version
-          echo "Installed Python versions:"
-          ls -d /opt/python
-          echo "python: $(which python), python3: $(which python3)"
-          echo "Git version: $(git --version)"
-          git config --global --add safe.directory $PWD
-          git config fetch.parallel 10
+          ./TheRock/build_tools/health_status.py
       
       - name: Fetch sources
         run: |
-          ./TheRock/build_tools/fetch_sources.py --jobs 12
+          ./TheRock/build_tools/fetch_sources.py --jobs 12 --no-include-rocm-libraries --no-include-ml-frameworks
+
+      - name: Patch rocm-libraries
+        run: |
+          git config --global --add safe.directory '*'
+          git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch
 
       - name: Install python deps
         run: |
@@ -92,32 +97,14 @@ jobs:
           aws-region: us-east-2
           role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
 
-      - name: Create Logs index Files and upload logs
+      - name: Post Build Upload
         if: always()
         run: |
-          python3 TheRock/build_tools/github_actions/create_log_index.py \
-            --build-dir=TheRock/build \
-            --amdgpu-family=${{ env.AMDGPU_FAMILIES }}
-
-          python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
-            --build-dir=TheRock/build \
-            --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }}
-
-      - name: Upload artifacts
-        run: |
-          python TheRock/build_tools/github_actions/upload_build_artifacts.py \
+          python3 TheRock/build_tools/github_actions/post_build_upload.py \
             --run-id ${{ github.run_id }} \
             --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
-            --build-dir TheRock/build
-
-      - name: Add Links to Job Summary
-        if: always()
-        run: |
-          python TheRock/build_tools/github_actions/upload_build_summary.py \
-            --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
-            --build-dir TheRock/build
+            --build-dir TheRock/build \
+            --upload
 
   therock-test-linux:
     name: "Test"
diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml
index 3232652b6b..40a3b0bec8 100644
--- a/.github/workflows/therock-ci.yml
+++ b/.github/workflows/therock-ci.yml
@@ -56,7 +56,14 @@ jobs:
     uses: ./.github/workflows/therock-ci-linux.yml
     secrets: inherit
     with:
-      cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
+      cmake_options: >-
+        -DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON 
+        -DTHEROCK_ENABLE_MIOPEN=ON 
+        -DTHEROCK_ENABLE_ALL=OFF 
+        -DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON 
+        -DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel
+        -DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
+        -DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
       amdgpu_families: "gfx94X-dcgpu"
       test_runs_on: "linux-mi325-1gpu-ossci-rocm"
 
diff --git a/.github/workflows/therock-test-component.yml b/.github/workflows/therock-test-component.yml
new file mode 100644
index 0000000000..674e93c1de
--- /dev/null
+++ b/.github/workflows/therock-test-component.yml
@@ -0,0 +1,71 @@
+name: Test component
+
+on:
+  workflow_call:
+    inputs:
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+      component:
+        type: string
+
+
+permissions:
+  contents: read
+
+jobs:
+  test_component:
+    name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})'
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 992
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    strategy:
+      fail-fast: false
+      matrix:
+        # The shard array is based on "total_shards" from "fetch_test_configurations.py"
+        # The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
+        shard: ${{ fromJSON(inputs.component).shard_arr }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: "./build"
+      THEROCK_BIN_DIR: "./build/bin"
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Test
+        timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
+        env:
+          SHARD_INDEX: ${{ matrix.shard }}
+          TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
+        run: |
+          ${{ fromJSON(inputs.component).test_script }}
diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml
index 37ddd399ad..54e068eb3d 100644
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -37,41 +37,17 @@ jobs:
 
   test_components:
     name: 'Test ${{ matrix.components.job_name }}'
-    runs-on: ${{ inputs.test_runs_on }}
-    needs: configure_test_matrix
+    needs: [configure_test_matrix]
     # skip tests if no test matrix to run
     if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
     strategy:
       fail-fast: false
       matrix:
         components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
-    defaults:
-      run:
-        shell: bash
-    env:
-      VENV_DIR: ${{ github.workspace }}/.venv
-      ARTIFACT_RUN_ID: "${{ github.run_id }}"
-      OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
-      THEROCK_BIN_DIR: "./build/bin"
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: "ROCm/TheRock"
-
-      - name: Run setup test environment workflow
-        uses: './.github/actions/setup_test_environment'
-        with:
-          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
-          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
-          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
-          VENV_DIR: ${{ env.VENV_DIR }}
-          FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
-          PLATFORM: ${{ inputs.platform }}
-          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
-
-      - name: Test
-        timeout-minutes: ${{ matrix.components.timeout_minutes }}
-        run: |
-          if [ "${{ inputs.PLATFORM }}" == "linux" ]; then source ${VENV_DIR}/bin/activate ; else . ${VENV_DIR}/Scripts/activate ; fi
-          ${{ matrix.components.test_script }}
+    uses: './.github/workflows/therock-test-component.yml'
+    with:
+      artifact_run_id: ${{  github.run_id }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      platform: ${{ inputs.platform }}
+      component: ${{ toJSON(matrix.components) }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 664c5219e2..2d936d3a48 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,12 +6,12 @@ repos:
         entry: clang-format-18 -i --style=file
         language: system
         types_or: [c++, inc]
-    -   id: copyright-year-checker
-        name: copyright-year-checker
-        entry: script/check_copyright_year.sh
-        verbose: false
-        language: script
-        types: [c++]
+    # -   id: copyright-year-checker
+    #     name: copyright-year-checker
+    #     entry: script/check_copyright_year.sh
+    #     verbose: false
+    #     language: script
+    #     types: [c++]
     -   id: remove-exec-bit
         name: Remove executable bit from non-executable files
         entry: script/remove_exec_bit.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 03fadb16f6..4c75de8019 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,9 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ## Composable Kernel 1.2.0 for ROCm 7.0.0
 
 ### Added
-
+* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
+* Added the new api to load different memory sizes to SGPR.
+* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
 * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
 * Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
@@ -15,6 +17,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
 * Added support for Stream-K version of mixed fp8/bf16 GEMM
 * Added support for Multiple D GEMM
+* Added support for Multiple ABD GEMM
 * Added GEMM pipeline for microscaling (MX) FP8/FP6/FP4 data types
 * Added support for FP16 2:4 structured sparsity to universal GEMM.
 * Added support for Split K for grouped convolution backward data.
@@ -28,7 +31,10 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for elementwise kernel.
 * Added benchmarking support for tile engine GEMM Multi D.
 * Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
-* Added support for skipping LDS to universal GEMM
+* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
+* Added support for f32 to FMHA (fwd/bwd).
+* Added tensor-wise quantization for CK_TILE GEMM.
+* Added support for skipping LDS to universal GEMM (no A column-wise, B row-wise support)
 
 ### Optimized
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52bb2ccd2d..f4d3a83c34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -220,7 +220,10 @@ rocm_check_target_ids(SUPPORTED_GPU_TARGETS
 
 message(STATUS "Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
 
-if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+# Cache SUPPORTED_GPU_TARGETS for debug
+set(SUPPORTED_GPU_TARGETS "${SUPPORTED_GPU_TARGETS}" CACHE STRING "List of supported GPU targets")
+
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     message(STATUS "Enabling XDL instances")
     add_definitions(-DCK_USE_XDL)
     set(CK_USE_XDL "ON")
@@ -234,6 +237,10 @@ endif()
 # new macro CK_TILE_USE_WMMA in order to separately compile examples for MFMA/WMMA
 set(CK_TILE_USE_WMMA 0)
 
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx10")
+    add_definitions(-DCK_GFX1030_SUPPORT)
+endif()
+
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     message(STATUS "Enabling WMMA instances")
     add_definitions(-DCK_USE_WMMA)
@@ -335,6 +342,7 @@ endif()
 option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
 option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
 option(ENABLE_ASM_DUMP "Whether to enable assembly dump for kernels." OFF)
+option(ENABLE_JSON_DUMP "Whether to enable json dump for examples." OFF)
 
 if(USE_BITINT_EXTENSION_INT4)
     add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -348,6 +356,11 @@ if(ENABLE_ASM_DUMP)
     message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
 endif()
 
+if (ENABLE_JSON_DUMP)
+    add_compile_definitions(CK_ENABLE_JSON_DUMP)
+    message("CK compiled with ENABLE_JSON_DUMP set to ${ENABLE_JSON_DUMP}")
+endif()
+
 ## Threads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 0900b7a1f8..823ae3bae0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -22,6 +22,9 @@ Xiaoyan Zhou, 2020
 [Jianfeng Yan](https://github.com/j4yan), 2021-2022
 [Jun Liu](https://github.com/junliume), 2021-2024
 
+[John Shumway](https://github.com/shumway), [Vidyasagar Ananthan](https://github.com/vidyasagar-amd), [Christopher Millette](https://github.com/cgmillette), [Maksim Podkorytov](https://github.com/tenpercent), [Thomas Ning](https://github.com/ThomasNing),[Andriy Roshchenko](https://github.com/andriy-ca), [Aviral Goel](https://github.com/AviralGoelAMD), [Cong Ma](https://github.com/CongMa13),[Thrupti Raj Lakshmana Gowda](https://github.com/ThruptiRajLakshmanaGowda), [Emily Martins](https://github.com/ecamartins), [Khushbu Agarwal](https://github.com/amd-khushbu), [Sudhir Kylasa](https://github.com/kylasa), [Jia Luo](https://github.com/JiaLuo-CAN),  2025-
+
+
 ## Product Manager
 [John Afaganis](https://github.com/afagaj)
 
diff --git a/Dockerfile b/Dockerfile
index 6f5cd0115d..07327442fe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,27 +1,23 @@
+
 FROM ubuntu:24.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.4.1
+ARG ROCMVERSION=7.0.1
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+ENV DEBIAN_FRONTEND=noninteractive
 
 # Add rocm repository
 RUN set -xe && \
-    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
-    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 
-RUN if [ "$ROCMVERSION" != "6.5" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
-        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
-        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
-    fi
-
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
-    amdgpu-install -y --usecase=rocm --no-dkms
+RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
+    apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
+    apt update && \
+    apt install python3-setuptools python3-wheel -y && \
+    apt install rocm-dev -y
 
 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libelf-dev \
     libnuma-dev \
     libpthread-stubs0-dev \
-    llvm-amdgpu \
     mpich \
     net-tools \
     pkg-config \
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     zip \
     libzstd-dev \
     openssh-server \
-    clang-format-12 \
     clang-format-18 \
     kmod && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* && \
     rm -rf amdgpu-install* && \
-# Remove unnecessary rocm components that take a lot of space
-    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
-
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
+    git clone https://github.com/ccache/ccache.git && \
     cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
     cd / && \
diff --git a/Dockerfile.aiter b/Dockerfile.aiter
index 245e39fb75..b61c1e41a5 100644
--- a/Dockerfile.aiter
+++ b/Dockerfile.aiter
@@ -1,10 +1,8 @@
-ARG BASE_DOCKER="rocm/pytorch:latest"
+ARG BASE_DOCKER="rocm/composable_kernel-private:ck_aiter_base"
 FROM $BASE_DOCKER
 ARG AITER_BRANCH="main"
 ARG CK_AITER_BRANCH="develop"
-RUN groupadd -g 109 render && \
-    usermod -u 1001 jenkins && \
-    groupmod -g 1001 jenkins && \
+RUN groupadd irc && \
     pip install pandas zmq einops && \
     pip install numpy==1.26.2 && \
     sudo mkdir /home/jenkins && \
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
index 0306057e45..47bd8294b6 100644
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
diff --git a/Jenkinsfile b/Jenkinsfile
index e7e57aded9..bb904052bd 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -33,9 +33,6 @@ def nthreads() {
     def nproc = sh(returnStdout: true, script: 'nproc')
     echo "Number of cores: ${nproc}"
     def n = nproc.toInteger()
-    if (n > 32){
-        n /= 2
-    }
     if (n > 64){
         n = 64
     }
@@ -56,7 +53,7 @@ def getBaseDockerImageName(){
     }
     else{
         def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
-        if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
+        if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
             img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
             }
         else{
@@ -152,7 +149,7 @@ def getDockerImage(Map conf=[:]){
         image = conf.get("docker_name", "")
         echo "Using legacy docker: ${image}"
     }
-    else if ( params.BUILD_GFX950 && conf.get("docker_name", "") != "" ){
+    else if ( (params.BUILD_GFX950 || params.RUN_CK_TILE_FMHA_TESTS) && conf.get("docker_name", "") != "" ){
         image = conf.get("docker_name", "")
         echo "Using special docker: ${image}"
     }
@@ -160,9 +157,9 @@ def getDockerImage(Map conf=[:]){
         image = getDockerImageName()
         echo "Using default docker: ${image}"
     }
-    //Check if image exists 
+    //Check if image exists
     def retimage
-    try 
+    try
     {
         echo "Pulling image: ${image}"
         retimage = docker.image("${image}")
@@ -189,11 +186,11 @@ def buildDocker(install_prefix){
         dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
     }
     else if(params.RUN_AITER_TESTS){
-        image_name = "rocm/composable_kernel:ck_aiter"
+        image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
         dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . "
     }
      else if(params.RUN_PYTORCH_TESTS){
-        image_name = "rocm/composable_kernel:ck_pytorch"
+        image_name = "${env.CK_DOCKERHUB}:ck_pytorch"
         dockerArgs = dockerArgs + " --no-cache -f Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . "
     }
    else{
@@ -235,7 +232,7 @@ def cmake_build(Map conf=[:]){
     def setup_args = conf.get("setup_args","")
     // make sure all unit tests always run on develop branch
     def runAllUnitTests = (env.BRANCH_NAME == "develop") ? true : params.RUN_ALL_UNIT_TESTS
-    
+
     if (prefixpath != "/usr/local"){
         setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
     }
@@ -324,7 +321,7 @@ def cmake_build(Map conf=[:]){
                     ${redis_pre_setup_cmd}
                 """)
             sh cmd1
-            setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
+            setup_args = " -DCMAKE_HIP_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
         }
         catch(Exception err){
             echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
@@ -360,7 +357,7 @@ def cmake_build(Map conf=[:]){
             "build_cmd",
             "${build_envs} ninja -j${nt} ${config_targets}"
         )
-        
+
         cmd = conf.get("cmd", """
             ${setup_cmd}
             ${build_cmd}
@@ -452,7 +449,7 @@ def buildHipClangJob(Map conf=[:]){
         checkout scm
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
-        // Jenkins is complaining about the render group 
+        // Jenkins is complaining about the render group
         def dockerOpts
         if ( params.BUILD_INSTANCES_ONLY ){
             dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
@@ -479,7 +476,7 @@ def buildHipClangJob(Map conf=[:]){
         def retimage
         (retimage, image) = getDockerImage(conf)
 
-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                 timeout(time: 20, unit: 'HOURS')
                 {
@@ -518,7 +515,7 @@ def Build_CK(Map conf=[:]){
         checkout scm
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
-        // Jenkins is complaining about the render group 
+        // Jenkins is complaining about the render group
         def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
         if (conf.get("enforce_xnack_on", false)) {
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -541,7 +538,7 @@ def Build_CK(Map conf=[:]){
         def image
         def retimage
 
-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
             try {
                 (retimage, image) = getDockerImage(conf)
                 withDockerContainer(image: image, args: dockerOpts) {
@@ -719,10 +716,10 @@ def process_results(Map conf=[:]){
     env.HSA_ENABLE_SDMA=0
     checkout scm
     //use older image that has user jenkins
-    def image = "rocm/composable_kernel:ck_ub22.04_rocm6.3"
+    def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3"
     def prefixpath = "/opt/rocm"
 
-    // Jenkins is complaining about the render group 
+    // Jenkins is complaining about the render group
     def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -731,7 +728,7 @@ def process_results(Map conf=[:]){
     def variant = env.STAGE_NAME
     def retimage
 
-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
         try
         {
             echo "Pulling image: ${image}"
@@ -830,7 +827,7 @@ def run_aiter_tests(Map conf=[:]){
     env.HSA_ENABLE_SDMA=0
     checkout scm
     //use the latest pytorch image
-    def image = "rocm/composable_kernel:ck_aiter"
+    def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
     def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
     def variant = env.STAGE_NAME
     def retimage
@@ -839,7 +836,7 @@ def run_aiter_tests(Map conf=[:]){
     dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
     echo "Docker flags: ${dockerOpts}"
 
-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
         try
         {
             echo "Pulling image: ${image}"
@@ -855,13 +852,21 @@ def run_aiter_tests(Map conf=[:]){
     }
 
     withDockerContainer(image: image, args: dockerOpts) {
-        timeout(time: 45, unit: 'MINUTES'){
+        timeout(time: 5, unit: 'HOURS'){
             try{
                 sh "rocminfo"
                 sh "python3 --version"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_ep.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting_mxfp4.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_tkw1.py"
             }
             catch(e){
                 echo "Throwing error exception while running AITER tests"
@@ -881,7 +886,7 @@ def run_pytorch_tests(Map conf=[:]){
     env.HSA_ENABLE_SDMA=0
     checkout scm
     //use the latest pytorch-nightly image
-    def image = "rocm/composable_kernel:ck_pytorch"
+    def image = "${env.CK_DOCKERHUB}:ck_pytorch"
     def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
     def variant = env.STAGE_NAME
     def retimage
@@ -890,7 +895,7 @@ def run_pytorch_tests(Map conf=[:]){
     dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
     echo "Docker flags: ${dockerOpts}"
 
-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
         try
         {
             echo "Pulling image: ${image}"
@@ -906,7 +911,7 @@ def run_pytorch_tests(Map conf=[:]){
     }
 
     withDockerContainer(image: image, args: dockerOpts) {
-        timeout(time: 45, unit: 'MINUTES'){
+        timeout(time: 2, unit: 'HOURS'){
             try{
                 sh "rocminfo"
                 sh "python3 --version"
@@ -926,7 +931,8 @@ def run_pytorch_tests(Map conf=[:]){
 }
 
 //launch develop branch daily jobs
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true
+                                              0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                               0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                               0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                               0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -952,20 +958,20 @@ pipeline {
             defaultValue: '',
             description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
         string(
-            name: 'ROCMVERSION', 
-            defaultValue: '6.4.1',
-            description: 'Specify which ROCM version to use: 6.4.1 (default).')
+            name: 'ROCMVERSION',
+            defaultValue: '7.0.1',
+            description: 'Specify which ROCM version to use: 7.0.1 (default).')
         string(
-            name: 'COMPILER_VERSION', 
-            defaultValue: '', 
+            name: 'COMPILER_VERSION',
+            defaultValue: '',
             description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
         string(
-            name: 'COMPILER_COMMIT', 
-            defaultValue: '', 
+            name: 'COMPILER_COMMIT',
+            defaultValue: '',
             description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit (default), or use some specific commit of llvm-project branch.')
         string(
-            name: 'BUILD_COMPILER', 
-            defaultValue: '/opt/rocm/llvm/bin/clang++', 
+            name: 'BUILD_COMPILER',
+            defaultValue: '/opt/rocm/llvm/bin/clang++',
             description: 'Build CK with /opt/rocm/bin/hipcc, /llvm-project/build/bin/clang++, or with /opt/rocm/llvm/bin/clang++ (default).')
         booleanParam(
             name: "RUN_FULL_QA",
@@ -1029,12 +1035,12 @@ pipeline {
             description: "Build CK and run tests on gfx90a (default: ON)")
         booleanParam(
             name: "BUILD_GFX942",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx942 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx942 (default: ON)")
         booleanParam(
             name: "BUILD_GFX950",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx950 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx950 (default: ON)")
         booleanParam(
             name: "BUILD_GFX10",
             defaultValue: true,
@@ -1121,15 +1127,16 @@ pipeline {
                     agent{ label rocmnode("nogpu") }
                     environment{
                         setup_args = "NO_CK_BUILD"
-                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
-                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
-                                -o -iname \'*.h.in\' \
-                                -o -iname \'*.hpp.in\' \
-                                -o -iname \'*.cpp.in\' \
-                                -o -iname \'*.cl\' \
+                        execute_cmd = "(cd .. && git ls-files \'*.h\' \
+                                \'*.hpp\' \
+                                \'*.cpp\' \
+                                \'*.h.in\' \
+                                \'*.hpp.in\' \
+                                \'*.cpp.in\' \
+                                \'*.cl\' \
                                 | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\' && \
+                                | grep -v 'include/rapidjson' \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\') && \
                                 /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
                                 -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
                                 -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
@@ -1150,15 +1157,17 @@ pipeline {
                     agent{ label rocmnode("nogpu") }
                     environment{
                         setup_args = "NO_CK_BUILD"
-                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
-                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
-                                -o -iname \'*.h.in\' \
-                                -o -iname \'*.hpp.in\' \
-                                -o -iname \'*.cpp.in\' \
-                                -o -iname \'*.cl\' \
+                        execute_cmd = "(cd .. && git ls-files \
+                                \'*.h\' \
+                                \'*.hpp\' \
+                                \'*.cpp\' \
+                                \'*.h.in\' \
+                                \'*.hpp.in\' \
+                                \'*.cpp.in\' \
+                                \'*.cl\' \
                                 | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\'"
+                                | grep -v 'include/rapidjson' \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')"
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -1201,6 +1210,18 @@ pipeline {
                         cleanWs()
                     }
                 }
+                stage("Run AITER Tests on gfx950")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_AITER_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx950")}
+                    steps{
+                        run_aiter_tests()
+                        cleanWs()
+                    }
+                }
             }
         }
         stage("Run Grouped Conv Large Case Tests")
@@ -1272,7 +1293,7 @@ pipeline {
                     agent{ label rocmnode("gfx90a")}
                     environment{
                         setup_args = "NO_CK_BUILD"
-                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
                                            make -j64 check"""
                     }
                     steps{
@@ -1315,7 +1336,7 @@ pipeline {
                     environment{
                         setup_args = "NO_CK_BUILD"
                         execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
-                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
+                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                            cd ../ &&
                                            example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
                     }
@@ -1324,6 +1345,25 @@ pipeline {
                         cleanWs()
                     }
                 }
+                stage("Run CK_TILE_FMHA Tests on gfx950")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx950") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx950 && \
+                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
+                                           cd ../ &&
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
             }
         }
         stage("Run TILE_ENGINE_GEMM Tests")
@@ -1347,23 +1387,15 @@ pipeline {
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                             -D GEMM_MULTI_D_DATATYPE="fp16" \
                                             -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                             -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_fp8_rcr && \
-                                           ./bin/benchmark_gemm_fp8_rcr && \
-                                           ninja -j64 benchmark_gemm_fp16_rcr && \
-                                           ./bin/benchmark_gemm_fp16_rcr && \
-                                           ninja -j64 benchmark_gemm_fp8_crr && \
-                                           ./bin/benchmark_gemm_fp8_crr && \
-                                           ninja -j64 benchmark_gemm_fp16_crr && \
-                                           ./bin/benchmark_gemm_fp16_crr && \
-                                           ninja -j64 benchmark_gemm_fp8_ccr && \
-                                           ./bin/benchmark_gemm_fp8_ccr && \
-                                           ninja -j64 benchmark_gemm_fp16_ccr && \
-                                           ./bin/benchmark_gemm_fp16_ccr && \
-                                           ninja -j64 benchmark_gemm_fp8_rrr && \
-                                           ./bin/benchmark_gemm_fp8_rrr && \
-                                           ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_preshuffle_all && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
                                            ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
                                            ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
                                            ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
@@ -1395,23 +1427,15 @@ pipeline {
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                             -D GEMM_MULTI_D_DATATYPE="fp16" \
                                             -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                             -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_fp8_rcr && \
-                                           ./bin/benchmark_gemm_fp8_rcr && \
-                                           ninja -j64 benchmark_gemm_fp16_rcr && \
-                                           ./bin/benchmark_gemm_fp16_rcr && \
-                                           ninja -j64 benchmark_gemm_fp8_crr && \
-                                           ./bin/benchmark_gemm_fp8_crr && \
-                                           ninja -j64 benchmark_gemm_fp16_crr && \
-                                           ./bin/benchmark_gemm_fp16_crr && \
-                                           ninja -j64 benchmark_gemm_fp8_ccr && \
-                                           ./bin/benchmark_gemm_fp8_ccr && \
-                                           ninja -j64 benchmark_gemm_fp16_ccr && \
-                                           ./bin/benchmark_gemm_fp16_ccr && \
-                                           ninja -j64 benchmark_gemm_fp8_rrr && \
-                                           ./bin/benchmark_gemm_fp8_rrr && \
-                                           ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_preshuffle_all && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
                                            ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
                                            ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
                                            ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
@@ -1426,6 +1450,36 @@ pipeline {
                         cleanWs()
                     }
                 }
+                stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx1201") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
+                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                            -D CMAKE_BUILD_TYPE=Release \
+                                            -D GPU_TARGETS="gfx1201" \
+                                            -D GEMM_DATATYPE="fp16" \
+                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -DGEMM_CONFIG_FILE=gfx120x_config.json \
+                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_fp16_rcr && \
+                                           ninja -j64 benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_fp16_crr && \
+                                           ninja -j64 benchmark_gemm_fp16_ccr """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
             }
         }
 
@@ -1514,7 +1568,7 @@ pipeline {
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
                     steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
                 }
@@ -1569,7 +1623,7 @@ pipeline {
                     agent{ label rocmnode("gfx942") }
                     steps{
                         script {
-                            def execute_args = params.NINJA_FTIME_TRACE ? 
+                            def execute_args = params.NINJA_FTIME_TRACE ?
                                 """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
                                     -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                     -D CMAKE_BUILD_TYPE=Release \
@@ -1578,8 +1632,8 @@ pipeline {
                                     -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                     -D CMAKE_BUILD_TYPE=Release \
                                     -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
-                            
-                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
+
+                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
                         }
                         cleanWs()
                     }
@@ -1605,13 +1659,13 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on gfx1101")
+                stage("Build CK and run Tests on gfx11")
                 {
                     when {
                         beforeAgent true
                         expression { params.BUILD_GFX11.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx1101") }
+                    agent{ label 'miopen && (gfx1101 || gfx1100)' }
                     environment{
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DUSE_OPT_GFX11=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
diff --git a/README.md b/README.md
index 459e17d9a3..32688b6574 100644
--- a/README.md
+++ b/README.md
@@ -184,7 +184,7 @@ hours to 1-2 minutes. In order to invoke sccache, you need to run:
 then add the following flags to the cmake command line:
 
 ```bash
- -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
+ -DCMAKE_HIP_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
 ```
 
 You may need to clean up the build folder and repeat the cmake and make steps in order to take
diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt
index f27e557cc3..21f6e652b8 100644
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -48,7 +48,7 @@ else()
 endif()
 
 if (GPU_TARGETS)
-    if (GPU_TARGETS MATCHES "gfx9")
+    if (GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
         add_definitions(-DCK_USE_XDL)
         set(CK_USE_XDL "ON")
     endif()
diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 2b2e6e2949..80429a781b 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 find_package(ROCM)
 include(ROCMInstallTargets)
 include(ROCMTest)
+find_package(hiprtc REQUIRED)
 
 rocm_setup_version(VERSION 1.0)
 
@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
-target_link_libraries(ck_host PRIVATE ck_headers)
+target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)
 
 set_target_properties(ck_host PROPERTIES 
     LINKER_LANGUAGE CXX
diff --git a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
index 6029ab0c7d..f233794ec1 100644
--- a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
 #include "ck/host/stringutils.hpp"
@@ -76,28 +76,28 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //   Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl| Prefetch|
 //       |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per|    Stage|
 //       |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|         |
-  {   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,        1},
-  {   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,        1},
-  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,        1},
-  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   32,   32,     1,     8,     4,        1},
-  {   256,    128,   128,    64,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    32,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,    128,   128,    32,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,     64,   256,    32,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,        1},
-  {   256,     64,   256,    32,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,        1},
-  {   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,        1},
-  {   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,        1},
+  {   256,    256,   128,    32,    64,    32,   8,   8,    2,   16,   16,     4,     8,     4,        1},
+  {   256,    256,   128,    32,   128,    32,   8,   8,    2,   16,   16,     4,     8,     8,        1},
+  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   128,    64,    64,    32,   8,   8,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    32,    64,    32,   8,   8,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,   128,    32,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    64,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   256,    64,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
 // Padded fallback kernel  
-  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,        1},
+  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,    64,    32,   128,    32,   8,   8,    2,   16,   16,     2,     4,     8,        1},
 // Irregular k
-  {   256,    256,   128,    40,    64,    32,   4,   4,    2,   32,   32,     2,     4,     2,        1},
-  {   256,    256,   128,    40,   128,    32,   4,   4,    2,   32,   32,     2,     4,     4,        1},
-  {   256,    128,   256,    40,    64,    32,   4,   4,    2,   32,   32,     1,     8,     2,        1},
-  {   256,    128,   256,    40,   128,    32,   4,   4,    2,   32,   32,     1,     8,     4,        1},
-  {   256,    128,   128,    40,    64,    32,   4,   4,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    40,   128,    32,   4,   4,    2,   32,   32,     1,     4,     4,        1},
+  {   256,    256,   128,    48,    64,    32,   4,   4,    2,   16,   16,     4,     8,     4,        1},
+  {   256,    256,   128,    48,   128,    32,   4,   4,    2,   16,   16,     4,     8,     8,        1},
+  {   256,    128,   256,    48,    64,    32,   4,   4,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    48,   128,    32,   4,   4,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   128,    48,    64,    32,   4,   4,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    48,   128,    32,   4,   4,    2,   16,   16,     2,     8,     8,        1},
         // clang-format on
     };
 
@@ -200,28 +200,28 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //         _MBlock_MWaveMPerXdl| ScalarPerVector
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1,16>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1,16>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1,16>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1,16>,               4},
+  {              S<1, 32, 1, 8>,               4},
 // Padded fallback kernel
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
 // Irregular k
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
         // clang-format on
     };
 
diff --git a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
index fe556615e0..b6cae670fe 100644
--- a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host/device_gemm_multiple_d/operation.hpp"
 #include "ck/host/stringutils.hpp"
@@ -81,16 +81,16 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //   Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per| Prefetch|
 //       |      |      |      |    |    |     |     | Wave| Wave|    Stage|
 //       |      |      |      |    |    |     |     |     |     |         |
-  {   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,        1},
-  {   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,        1},
-  {   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,        1},
+  {   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   256,    32,   8,   8,   16,   16,    4,    8,        1},
+  {   128,   128,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   128,   128,    64,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   128,    64,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   256,   128,    64,    32,   8,   8,   16,   16,    4,    2,        1},
+  {   256,    64,   128,    32,   8,   8,   16,   16,    2,    4,        1},
 //  Irregular tile
-  {    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,        1},
+  {    64,    32,    32,    32,   8,   8,   16,   16,    2,    2,        1},
         // clang-format on
     };
 
@@ -194,14 +194,14 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //         _MBlock_MWaveMPerXdl| ScalarPerVector
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 4>,               8},
-  {              S<1, 16, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 4>,               4},
+  {              S<1, 16, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
 //  Irregular tile
   {              S<1, 16, 1, 4>,               1},
         // clang-format on
diff --git a/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
index a2f322c50f..26988255c3 100644
--- a/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
 #include <iostream>
@@ -55,12 +55,12 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
 //   Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per| Prefetch|
 //       |      |      |      |    |    |     |     | Wave| Wave|    Stage|
 //       |      |      |      |    |    |     |     |     |     |         |
-  {   64,   64,   32,    32,   8,   8,   32,   32,    2,    1,        1},
-  {   256,   128,   256,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   64,   64,   64,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,        1}
+  {   64,     64,   32,     32,   8,   8,   16,   16,    4,    2,        1},
+  {   256,   128,   256,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   64,     64,   64,     32,   8,   8,   16,   16,    4,    4,        1},
+  {   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   128,   128,   128,    32,   8,   8,   16,   16,    8,    4,        1}
         // clang-format on
     };
 
@@ -116,11 +116,11 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
   {              S<1, 16, 1, 4>,               1},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 16, 1, 16>,              4},
+  {              S<1, 32, 1, 8>,               4},
   {              S<1, 16, 1, 4>,               1},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1, 8>,               8}
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1, 8>,               4}
         // clang-format on
     };
 
@@ -223,8 +223,9 @@ extern "C" __global__ void run_${name}(
     constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler();
 
     // GridwiseGemm
-    using GridwiseGemm = DeviceConv::GridwiseGemm;
-
+    using GridwiseGemm = ck::conditional_t<ck::get_warp_size() == 64,
+                                           typename DeviceConv::GridwiseGemm64,
+                                           typename DeviceConv::GridwiseGemm32>;
     static constexpr auto I0 = ck::Number<0>{};
 
     ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
diff --git a/codegen/src/utils.cpp b/codegen/src/utils.cpp
index c15a9fd7d3..4cfe7a117f 100644
--- a/codegen/src/utils.cpp
+++ b/codegen/src/utils.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host/utils.hpp"
 
@@ -13,7 +13,8 @@ std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
 
 const std::unordered_set<std::string>& get_xdlop_archs()
 {
-    static std::unordered_set<std::string> supported_archs{"gfx90a", "gfx908", "gfx942"};
+    static std::unordered_set<std::string> supported_archs{
+        "gfx90a", "gfx908", "gfx942", "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201"};
     return supported_archs;
 }
 
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
index 9902caab04..15365aadf1 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                               Epilogue{1.0f, 1.0f});
     out_host.SetZero();
     ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
     for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
     {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
         // substitute instance values into the template
         auto src = ck::host::InterpolateString(
             conv_compile_check,
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
index 205283e7aa..d7ff793cb8 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                               Epilogue{1.0f, 1.0f});
     out_host.SetZero();
     ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
     for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
     {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
         // substitute instance values into the template
         auto src = ck::host::InterpolateString(
             conv_compile_check,
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
index 2b83af2432..1129dbc015 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                               Epilogue{1.0f, 1.0f});
     out_host.SetZero();
     ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
     for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
     {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
         // substitute instance values into the template
         auto src = ck::host::InterpolateString(
             conv_compile_check,
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
index fbe27e9c8b..5696178f68 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                               Epilogue{1.0f, 1.0f});
     out_host.SetZero();
     ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
     for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
     {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
         // substitute instance values into the template
         auto src = ck::host::InterpolateString(
             conv_compile_check,
diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst
index 1b978ed63e..bd414c08d6 100644
--- a/docs/Contributors_Guide.rst
+++ b/docs/Contributors_Guide.rst
@@ -5,100 +5,58 @@
 .. _contributing-to:
 
 ********************************************************************
-Contributor's guide
+Contributing to Composable Kernel
 ********************************************************************
 
-This chapter explains the rules for contributing to the Composable Kernel project, and how to contribute.
+Review the `Composable Kernel documentation <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_ before contributing to the Composable Kernel project. This documentation provides information about core concepts and configurations, as well as providing :doc:`steps for building Composable Kernel <install/Composable-Kernel-install>`. Some of this information is also available in the `Composable Kernel README <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_.
 
-Getting started
-===============
-
-#. **Documentation:** Before contributing to the library, familiarize yourself with the
-   `Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
-   It provides insight into the core concepts, environment configuration, and steps to obtain or
-   build the library. You can also find some of this information in the
-   `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
-   on the project's GitHub page.
-   <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
-   from the AMD Community portal. It offers a deeper understanding of the library's objectives and showcases its performance capabilities.
-#. **General information:** For broader information about AMD products, consider exploring the
-   `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
-
-How to contribute
-===================
-
-You can make an impact by reporting issues or proposing code enhancements through pull requests.
+Consult the `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_ for more information about AMD products.
 
 Reporting issues
-----------------
+=================
 
-Use `Github issues <https://github.com/ROCm/composable_kernel/issues>`_
-to track public bugs and enhancement requests.
+Use `Github issues <https://github.com/ROCm/composable_kernel/issues>`_ to log and track issues and enhancement requests.
 
-If you encounter an issue with the library, please check if the problem has already been
-reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
-issue. All reported issues must include:
+If you encounter an issue with the Composable Kernel library, search the existing GitHub issues to determine whether the problem has already been
+reported. If it hasn't, submit a new issue that includes:
 
-* A comprehensive description of the problem, including:
+* A description of the problem, including what you observed, what you were expecting, and why this was an issue.
+ 
+* Your configuration details, including the GPU, OS, and ROCm version, and any Docker image you used.
 
-  * What did you observe?
-  * Why do you think it is a bug (if it seems like one)?
-  * What did you expect to happen? What would indicate the resolution of the problem?
-  * Are there any known workarounds?
+* The steps to reproduce the issue, including any CMake command you used to build the library, as well as the frequency of the issue.
 
-* Your configuration details, including:
+* Any workarounds you've found and what you expect in a resolution. 
 
-  * Which GPU are you using?
-  * Which OS version are you on?
-  * Which ROCm version are you using?
-  * Are you using a Docker image? If so, which one?
 
-* Steps to reproduce the issue, including:
+Contributing to the codebase
+=============================
 
-  * What actions trigger the issue? What are the reproduction steps?
+All external contributors to the Composable Kernel codebase must follow these guidelines:
 
-    * If you build the library from scratch, what CMake command did you use?
+* Use the correct branch: Use your own branch for your changes. Create your branch from the develop branch. 
 
-  * How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
+* Describe your changes: Provide the motivation for the changes and a general description of all code changes.
 
-Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
+* Add design documents for major changes: Major architectural changes must be accompanied by comprehensive design documents uploaded with your pull request. 
 
-Creating Pull Requests
-----------------------
+* Add inline documentation: Include relevant documentation and inline comments with your code changes.
 
-You can submit `Pull Requests (PR) on GitHub
-<https://github.com/ROCm/composable_kernel/pulls>`_.
+* Link your pull request to related issues: Add links to any issues resolved by your changes in your pull request description.
 
-All contributors are required to develop their changes on a separate branch and then create a
-pull request to merge their changes into the `develop` branch, which is the default
-development branch in the Composable Kernel project. All external contributors must use their own
-forks of the project to develop their changes.
+* Verify and test the changes: Run all relevant existing tests and write new tests for any new functionality that isn't covered by existing tests.
 
-When submitting a Pull Request you should:
+* Provide performance numbers: Include documentation showing before and after performance numbers for any changes that potentially impact build times or run times. 
 
-* Describe the change providing information about the motivation for the change and a general
-  description of all code modifications.
+* Keep your branch up to date: Regularly rebase or merge the develop branch back into your feature branch. This should be done both prior to creating your pull request and during the review process.
 
-* Verify and test the change:
+* Ensure a manageable pull request size: Pull requests should be limited to approximately one thousand lines. If your changes significantly exceed one thousand lines, break them into smaller pull requests that can be reviewed independently.
 
-  * Run any relevant existing tests.
-  * Write new tests if added functionality is not covered by current tests.
+* Use pre-commit hooks to adhere to the coding style: Composable Kernel's coding style is defined in `.clang-format <https://github.com/ROCm/composable_kernel/blob/develop/.clang-format>`_. Use the provided pre-commit hooks to run clang formatting and linting. Instructions on installing pre-commit hooks are available in the `README file <https://github.com/ROCm/composable_kernel/blob/develop/.clang-format>`_. 
 
-* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
-  the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
-  highly recommend contributors utilize this method to maintain consistent code formatting.
-  Instructions on setting up `pre-commit` can be found in the project's
-  `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
+Forks require an approver from AMD to trigger continuous integration (CI) testing. This approval process is necessary for security and resource management.
 
-* Link your PR to any related issues:
+Depending on the complexity of your changes, an  AMD developer might need to pull your changes and perform additional fixes or modifications before merging. This collaborative approach ensures compatibility with internal systems and standards.
 
-  * If there is an issue that is resolved by your change, please provide a link to the issue in
-    the description of your pull request.
+You can see a complete list of pull requests on the `Composable Kernel GitHub page <https://github.com/ROCm/composable_kernel/pulls>`_.
 
-* For larger contributions, structure your change into a sequence of smaller, focused commits, each
-  addressing a particular aspect or fix.
-
-Following the above guidelines ensures a seamless review process and faster assistance from our
-end.
-
-Thank you for your commitment to enhancing the Composable Kernel project! 
diff --git a/docs/index.rst b/docs/index.rst
index 89a5e3e836..c28eb646b5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,7 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab
     * :doc:`Composable Kernel API reference <./doxygen/html/namespace_c_k>`
     * :doc:`CK Tile API reference <./doxygen/html/namespaceck__tile>`
     * :doc:`Composable Kernel complete API class list <./doxygen/html/annotated>`
+    * :doc:`Composable Kernel glossary <./reference/Composable-Kernel-Glossary>`
     
 To contribute to the documentation refer to `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
 
diff --git a/docs/reference/Composable-Kernel-Glossary.rst b/docs/reference/Composable-Kernel-Glossary.rst
new file mode 100644
index 0000000000..847802b903
--- /dev/null
+++ b/docs/reference/Composable-Kernel-Glossary.rst
@@ -0,0 +1,256 @@
+.. meta::
+  :description: Composable Kernel glossary of terms
+  :keywords: composable kernel, glossary
+
+***************************************************
+Composable Kernel glossary
+
+***************************************************
+
+.. glossary::
+    :sorted:
+
+    arithmetic logic unit
+        The arithmetic logic unit (ALU) is the GPU component responsible for arithmetic and logic operations.
+
+    compute unit
+        The compute unit (CU) is the parallel vector processor in an AMD GPU with multiple :term:`ALUs<arithmetic logic unit>`. Each compute unit will run all the :term:`wavefronts<wavefront>` in a :term:`work group>`. A compute unit is equivalent to NVIDIA's streaming   multiprocessor.
+
+    matrix core
+        A matrix core is a specialized GPU unit that accelerate matrix operations for AI and deep learning tasks. A GPU contains multiple matrix cores.
+
+    register
+        Registers are the fastest tier of memory. They're used for storing temporary values during computations and are private to the :term:`work-items<work-item>` that use them.
+
+    VGPR
+        See :term:`vector general purpose register`.
+
+    vector general purpose register 
+        A vector general purpose register (VGPR) is a :term:`register` that stores individual thread data. Each thread in a :term:`wave<wavefront>` has its own set of VGPRs for private variables and calculations. 
+
+    SGPR
+        See :term:`scalar general purpose register`.
+
+    scalar general purpose register
+        A scalar general purpose register (SGPR) is a :term:`register` shared by all the :term:`work items<work item>` in a :term:`wave<wavefront>`. SGPRs are used for constants, addresses, and control flow common across the entire wave.
+
+    LDS
+        See :term:`local data share`.
+
+    local data share
+        Local data share (LDS) is high-bandwidth, low-latency on-chip memory accessible to all the :term:`work-items<work-item>` in a :term:`work group`. LDS is equivalent to NVIDIA's shared memory. 
+
+    LDS banks
+        LDS banks are a type of memory organization where consecutive addresses are distributed across multiple memory banks for parallel access. LDS banks are used to prevent memory access conflicts and improve bandwidth when LDS is used.
+
+    global memory
+        The main device memory accessible by all threads, offering high capacity but higher latency than shared memory.
+
+    pinned memory
+        Pinned memory is :term:`host` memory that is page-locked to accelerate transfers between the CPU and GPU.
+
+    dense tensor
+        A dense tensor is a tensor where most of its elements are non-zero. Dense tensors are typically stored in a contiguous block of memory.
+
+    sparse tensor
+        A sparse tensor is a tensor where most of its elements are zero. Typically only the non-zero elements of a sparse tensor and their indices are stored.
+
+    host
+        Host refers to the CPU and the main memory system that manages GPU execution. The host is responsible for launching kernels, transferring data, and coordinating overall computation.
+
+    device
+        Device refers to the GPU hardware that runs parallel kernels. The device contains the :term:`compute units<compute unit>`, memory hierarchy, and specialized accelerators.
+
+    work-item
+        A work-item is the smallest unit of parallel execution. A work-item runs a single independent instruction stream on a single data element. A work-item is equivalent to an NVIDIA thread.
+
+    wavefront
+        Also referred to as a wave, a wavefront is a group of :term:`work-items<work-item>` that run the same instruction. A wavefront is equivalent to an NVIDIA warp.
+
+    work group
+        A work group is a collection of :term:`work-items<work-item>` that can synchronize and share memory. A work group is equivalent to NVIDIA's thread block. 
+
+    grid
+        A grid is a collection of :term:`work groups<work group>` that run a kernel. Each work group within the grid operates independently and can be scheduled on a different :term:`compute unit`. A grid can be organized into one, two, or three dimensions. A grid is equivalent to an NVIDIA thread block.
+
+    block Size
+        The block size is the number of :term:`work-items<work-item>` in a :term:`compute unit`.
+
+    SIMT
+        See :term:`single-instruction, multi-thread`
+
+    single-instruction, multi-thread 
+        Single-instruction, multi-thread (SIMT) is a parallel computing model where all the :term:`work-items<work-item>` within a :term:`wavefront` run the same instruction on different data. 
+
+    SIMD
+        See :term:`single-instruction, multi-data`
+
+    single-instruction, multi-data
+        Single-instruction, multi-data (SIMD) is a parallel computing model where the same instruction is run with different data simultaneously. 
+
+    occupancy
+        The ratio of active :term:`wavefronts<wavefront>` to the maximum possible number of wavefronts.
+
+    kernel
+        A kernel is a function that runs an :term:`operation` or a collection of operations. A kernel will run in parallel on several :term:`work-items<work-item>` across the GPU. In Composable Kernel, kernels require :term:`pipelines<pipeline>`.
+
+    operation
+        An operation is a computation on input data. 
+        
+    pipeline
+        A Composable Kernel pipeline schedules the sequence of operations for a :term:`kernel`, such as the data loading, computation, and storage phases. A pipeline consists of a :term:`problem` and a :term:`policy`. 
+
+    tile partitioner
+        The tile partitioner defines the mapping between the :term:`problem` dimensions and GPU hierarchy. It specifies :term:`workgroup`-level :term:`tile` sizes and determines :term:`grid` dimensions by dividing the problem size by the tile sizes.
+
+    problem
+        The problem is the part of the :term:`pipeline` that defines input and output shapes, data types, and mathematical :term:`operations<operation>`.
+
+    policy
+        The policy is the part of the :term:`pipeline` that defines memory access patterns and hardware-specific optimizations.
+
+    user customized tile pipeline
+        A customized :term:`tile` :term:`pipeline` that combines custom :term:`problem` and :term:`policy` components for specialized computations. 
+
+    user customized tile pipeline optimization
+        The process of tuning the :term:`tile` size, memory access pattern, and hardware utilization for specific workloads.
+
+    tile programming API
+        The :term:`tile` programming API is Composable Kernel's high-level interface for defining tile-based computations with predefined hardware mappings for data loading and storing.
+
+    coordinate transformation primitives
+        Coordinate transformation primitives are Composable Kernel utilities for converting between different coordinate systems.
+
+    reference kernel
+        A reference :term:`kernel` is a baseline kernel implementation used to verify correctness and performance. Composable Kernel makes two reference kernels, one for CPU and one for GPU, available.
+
+    launch parameters
+        Launch parameters are the configuration values, such as :term:`grid` and :term:`block size`, that determine how a :term:`kernel` is mapped to hardware resources.
+
+    memory coalescing
+        Memory coalescing is an optimization strategy where consecutive :term:`work-items<work-item>` access consecutive memory addresses in such a way that a single memory transaction serves multiple work-items.
+
+    alignment
+        Alignment is a memory management strategy where data structures are stored at addresses that are multiples of a specific value.
+
+
+    bank conflict
+        A bank conflict occurs when multiple :term:`work-items<work-item>` in a :term:`wavefront` access different addresses that map to the same shared memory bank.
+
+    padding
+        Padding is the addition of extra elements, often zeros, to tensor edges in order to control output size in convolution and pooling, or to align data for memory access.
+
+    transpose
+        Transpose is an :term:`operation` that rearranges the order of tensor axes, often for the purposes of matching :term:`kernel` input formats or optimize memory access patterns.
+
+    permute
+        Permute is an :term:`operation` that rearranges the order of tensor axes, often for the purposes of matching :term:`kernel` input formats or optimize memory access patterns.
+
+    host-device transfer
+        A host-device transfer is the process of moving data between :term:`host` and :term:`device` memory. 
+
+    stride
+        A stride is the step size to move from one element to the next in a specific dimension of a tensor or matrix. In convolution and pooling, the stride determines how far the :term:`kernel` moves at each step.
+
+    dilation
+        Dilation is the spacing between :term:`kernel` elements in convolution :term:`operations<operation>`, allowing the receptive field to grow without increasing kernel size.
+
+    Im2Col
+        Im2Col is a data transformation technique that converts image data to column format.
+
+    Col2Im
+        Col2Im is a data transformation technique that converts column data to image format.
+
+    fast changing dimension
+        The fast changing dimension is the innermost dimension in memory layout.
+
+    outer dimension
+        The outer dimension is the slower-changing dimension in memory layout.
+
+    inner dimension
+        The inner dimension is the faster-changing dimension in memory layout.
+
+    tile
+        A tile is a sub-region of a tensor or matrix that is processed by a :term:`work group` or :term:`work-item`. Rectangular data blocks are the unit of computation and memory transfer in Composable Kernel, and are the basis for tiled algorithms.
+
+    block tile
+        A block tile is a memory :term:`tile` processed by a :term:`work group`.
+
+    wave tile
+        A wave :term:`tile` is a sub-tile processed by a single :term:`wavefront` within a :term:`work group`. The wave tile is the base level granularity of a :term:`single-instruction, multi-thread (SIMD)<single-instruction, multi-thread>` model.
+
+    tile distribution
+        The tile distribution is the hierarchical data mapping from :term:`work-items<work-item>` to data in memory.
+
+    tile window
+        Viewport into a larger tensor that defines the current tile's position and boundaries for computation.
+
+    load tile
+        Load tile is an operation that transfers data from :term:`global memory` or the :term:`load data share` to :term:`vector general purpose registers<vector general purpose register>`.
+
+    store tile
+        Store tile is an operation that transfers data from  :term:`vector general purpose registers<vector general purpose register>` to :term:`global memory` or the :term:`load data share`.
+
+    descriptor
+        Metadata structure that defines :term:`tile` properties, memory layouts, and coordinate transformations for Composable Kernel :term:`operations<operation>`.
+
+    input
+        See :term:`problem shape`.
+
+    problem shape
+        The problem shape defines the dimensions and data types of input tensors that define the :term:`problem`.
+
+    vector
+        The vector is the smallest data unit processed by an individual :term:`work-item`. A vectors is typically four to sixteen elements, depending on data type and hardware.
+
+    elementwise
+        An elementwise :term:`operation` is an operation applied to each tensor element independently. 
+
+    epilogue
+        The epilogue is the final stage of a kernel. Activation functions, bias, and other post-processing steps are applied in the epilogue. 
+
+    Add+Multiply
+        See :term:`fused add multiply`.
+
+    fused add multiply
+        A common fused :term:`operation` in machine language and linear algebra, where an :term:`elementwise` addition is immediately followed by a multiplication. Fused add multiply is often used for bias and scaling in neural network layers.
+
+    MFMA
+        See :term:`matrix fused multiply-add`.
+
+    matrix fused multiply-add
+        Matrix fused multiply-add (MFMA) is a :term:`matrix core` instruction for GEMM :term:`operations<operation>`. 
+
+    GEMM
+        See :term:`general matrix multiply`.
+
+    general matrix multiply 
+        A general matrix multiply (GEMM) is a Core matrix :term:`operation` in linear algebra and deep learning. A GEMM is defined as :math:`C = {\alpha}AB + {\beta}C`, where :math:`A`, :math:`B`, and :math:`C` are matrices, and :math:`\alpha` and :math:`\beta` are scalars. 
+
+    VGEMM
+        See :term:`naive GEMM`.
+
+    vanilla GEMM
+        See :term:`naive GEMM`.
+
+    naive GEMM 
+        The naive GEMM, sometimes referred to as a vanilla GEMM or VGEMM, is the simplest form of :term:`GEMM` in Composable Kernel. The naive GEMM is defined as :math:`C = AB`, where :math:`A`, :math:`B`, and :math:`C` are matrices. The naive GEMM is the baseline GEMM that all other GEMM :term:`operations<operation>` build on.
+
+    GGEMM
+        See :term:`grouped GEMM`.
+
+    grouped GEMM
+        A :term:`kernel` that calls multiple :term:`VGEMMs<naive GEMM>`. Each call can have a different :term:`problem shape`. 
+
+    batched GEMM
+        A :term:`kernel` that calls :term:`VGEMMs<naive GEMM>` with different batches of data. All the data batches have the same :term:`problem shape`. 
+
+    Split-K GEMM
+        Split-K GEMM is a parallelization strategy that partitions the reduction dimension (K) of a :term:`GEMM` across multiple :term:`compute units<compute unit>`, increasing parallelism for large matrix multiplications.
+
+    GEMV
+        See :term:`general matrix vector multiplication`
+
+    general matrix vector multiplication
+        General matrix vector multiplication (GEMV) is an :term:`operation` where a matrix is multiplied by a vector, producing another vector. 
+
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 2ef3383d84..33ad8d91f8 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -34,8 +34,14 @@ subtrees:
     title: Composable Kernel vector utilities
   - file: reference/Composable-Kernel-wrapper.rst
     title: Composable Kernel wrapper
+  - file: doxygen/html/namespace_c_k.rst
+    title: CK API reference 
+  - file: doxygen/html/namespaceck__tile.rst
+    title: CK Tile API reference
   - file: doxygen/html/annotated.rst
-    title: Composable Kernel class list
+    title: Full API class list
+  - file: reference/Composable-Kernel-Glossary.rst
+    title: Glossary
 
 - caption: About
   entries:
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 61f3ba5351..03bde86421 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -44,8 +44,7 @@ list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllv
 example_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS})
 
-
-list(APPEND gpu_list gfx942 gfx950)
+list(APPEND gpu_list gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -89,7 +88,14 @@ foreach(gpu IN LISTS GPU_TARGETS)
 
         add_example_executable(example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp)
         add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16)
+        set(target 1)
+    endif()
+endforeach()
 
+list(APPEND gpu_list gfx90a gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
         add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
         add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
 
@@ -99,6 +105,16 @@ foreach(gpu IN LISTS GPU_TARGETS)
     endif()
 endforeach()
 
+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_gemm_xdl_lds_direct_load_fp32_tf32 gemm_xdl_lds_direct_load_fp32_tf32.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
+
 add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
 
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 434f549443..e482953e46 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -310,10 +310,14 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
     return true;
 }
 
-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
@@ -351,10 +355,14 @@ inline __host__ __device__ constexpr double get_rtol()
     }
 }
 
-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
index 7178ad46b9..9b1d756f85 100644
--- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -199,9 +199,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
 
         return true;
     }
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index 414683ffdf..66a0d98238 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -37,7 +37,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               4, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
 // clang-format on
 
 using DeviceGemmInstance = DeviceGemmInstance1;
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
index e16f184a20..37fa8800a3 100644
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -249,9 +249,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
 
         return true;
     }
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
index f83d479713..483ad800af 100644
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
         AElementOp, BElementOp, CElementOp, GemmDefault, 
         256, Scale_Block_N, Scale_Block_K,
         128, 128,
-        KPerBlock, 8, 32,
-        32,   32,
-        4,    1,
+        KPerBlock, 8, 16,
+        16,   16,
+        8,    2,
         S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
         2, 8, 8, 0,
         S<2, 128, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 16, 1, 16>, 4,
         ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;
 
 // clang-format on
@@ -281,9 +281,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
 
         return true;
     }
diff --git a/example/01_gemm/gemm_xdl_fp16_v2.cpp b/example/01_gemm/gemm_xdl_fp16_v2.cpp
index ecd3b7be5d..59c059d014 100644
--- a/example/01_gemm/gemm_xdl_fp16_v2.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_v2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -33,13 +33,13 @@ using DeviceGemmInstance =
         2,   256,
         256, 256, 
         32, 8, 4,
-        32,   32,
-        4,    4, 
+        16,   16,
+        8,    8, 
         S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
         2, 8, 8, 0,
         S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>,
         1, 8, 4, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
         ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
 // clang-format on
 
diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
index 266a1e9d3e..b3ca60cea7 100644
--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 static constexpr bool PermuteA = false;
 static constexpr bool PermuteB = false;
-
+static constexpr int KPack     = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
 // clang-format off
 #if 0
 using DeviceGemmV2Instance = 
@@ -56,14 +56,14 @@ using DeviceGemmV2Instance =
         AElementOp, BElementOp, CElementOp, GemmDefault, 
         256,
         256, 256,
-        128, 16, 32,
-        32,   32,
-        4,    4,
+        128, 16, KPack,
+        16,   16,
+        8,    8,
         S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
         2, 16, 16, 0,
         S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
         2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
         ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, F8, F8, PermuteA, PermuteB>;
 
 #endif
@@ -160,7 +160,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     auto gemm = DeviceGemmV2Instance{};
 
     // weight pre-shuffle
-    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
     int NLane = gemm.GetPreShuffleParameters();
     int KLane = 64 / NLane;
 
@@ -269,9 +268,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;
 
         return true;
     }
diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
index 0575314dff..0e6503d21f 100644
--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
         AElementOp, BElementOp, CElementOp, GemmDefault, 
         256,
         128, 128,
-        KPerBlock, 16, 32,
-        32,   32,
-        2,    2,
+        KPerBlock, 16, 16,
+        16,   16,
+        4,    4,
         S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
         2, 16, 16, 0,
         S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 32, 1, 8>, 4,
         ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;
 
 // clang-format on
@@ -247,9 +247,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;
 
         return true;
     }
diff --git a/example/01_gemm/gemm_xdl_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp8_v3.cpp
index da891267b2..a9e39256ba 100644
--- a/example/01_gemm/gemm_xdl_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -36,7 +36,7 @@ using DeviceGemmV2Instance =
         2, 16, 16, 0,
         S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
         2, 16, 16, 0,
-        1, 2, S<1, 32, 1, 8>, 8,
+        1, 2, S<1, 32, 1, 8>, 4,
         ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
 // clang-format on
 
diff --git a/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp b/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
new file mode 100644
index 0000000000..9b92fad779
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "common.hpp"
+
+#define USING_DIRECT_LOADS 1
+#if USING_DIRECT_LOADS
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp"
+#else
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#endif
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using F32 = float;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+#if USING_DIRECT_LOADS
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer|
+// ######| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockLds|
+// ######| CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| LoopScheduler      |  pipeline ver           | gemm type  |
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|
+// ######| XDL|  XDL|  Per|  Per|   ThreadCluster| SrcAccessOrder|   SrcVectorDim|         Scalar| AddExtraM|   ThreadCluster| SrcAccessOrder|  SrcVectorDim|         Scalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|               |               |      PerVector|          | Lengths_K0_N_K1|               |              |      PerVector|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |          |                |               |              |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,
+           8,   8,   32,   32,    2,    2,      S<4, 8, 8>,     S<1, 0, 2>,              2,              1,         1,      S<4, 8, 8>,     S<1, 0, 2>,             2,              1,         1,
+           1,           1,               S<1, 8, 1, 8>,               4,   ck::LoopScheduler::Default, ck::PipelineVersion::v4, ComputeDataType>;
+// clang-format on
+#else
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,               4>;
+// clang-format on
+#endif
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp,
+                                                                        ComputeDataType,
+                                                                        ComputeDataType>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
index d149fd88f1..d5c42558c4 100644
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -36,7 +36,7 @@ using BDataType   = ck::half_t;
 using CDataType   = ck::half_t;
 using AccDataType = float;
 #else  
-                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   64,     4,  4,   16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
+                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   128,     4,  4,   16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
 using ADataType   = float;
 using BDataType   = float;
 using CDataType   = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
     auto c_element_op = CElementOp{};
-
     // do GEMM
     auto gemm     = DeviceGemmInstance{};
     auto invoker  = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
         return 0;
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
+    float ave_time   = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
         sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
diff --git a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
index d8672f6a0c..76a30657f0 100644
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -29,7 +29,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletM
 // ######|        |        |        |      Type|      Type|        Type|         DataType|      Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |            |                 |          |   Operation|   Operation|   Operation|               |    Stage|                |                |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |            |                 |          |            |            |            |               |         |                |                |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               4>;
 // clang-format on
 
 using DeviceGemmInstance = DeviceGemmInstance;
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 3e018aad1e..7fb0c1e812 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -2,7 +2,11 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-#include "ck/library/utility/validation_common.hpp"
+
+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
 
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
@@ -24,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
@@ -54,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
 
-    try
-    {
-        ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-            M, N, K, StrideA, StrideB, StrideC);
-    }
-    catch(const std::runtime_error& e)
-    {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return false;
-    }
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
 
@@ -218,8 +211,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         pass &= ck::utils::check_err(c_m_n_device_result,
                                      c_m_n_host_result,
                                      "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
 #endif
     }
 
@@ -249,8 +242,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         pass &= ck::utils::check_err(c_m_n_device_result,
                                      c_m_n_device_ref_result,
                                      "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
     }
 
     return pass == true;
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
index abf7ef3905..1049b5d07c 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -87,10 +87,10 @@ using DeviceOpInstance =
                                                                    32,
                                                                    8,
                                                                    8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                    4,
-                                                                   2,
                                                                    S<4, 64, 1>,
                                                                    S<1, 0, 2>,
                                                                    S<1, 0, 2>,
@@ -108,7 +108,7 @@ using DeviceOpInstance =
                                                                    1,
                                                                    1,
                                                                    S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;
 
 int main(int argc, char* argv[])
 {
diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
index dffeff2337..992e7c19c8 100644
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -83,10 +83,10 @@ using DeviceOpInstance =
                                                                    32,
                                                                    8,
                                                                    8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                    4,
-                                                                   2,
                                                                    S<4, 64, 1>,
                                                                    S<1, 0, 2>,
                                                                    S<1, 0, 2>,
@@ -104,7 +104,7 @@ using DeviceOpInstance =
                                                                    1,
                                                                    1,
                                                                    S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;
 
 int main(int argc, char* argv[])
 {
@@ -113,13 +113,13 @@ int main(int argc, char* argv[])
     bool time_kernel     = false;
 
     // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
+    ck::index_t M = 1920;
+    ck::index_t N = 2048;
+    ck::index_t K = 2048;
 
-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideE = 4096;
+    ck::index_t StrideA = 2048;
+    ck::index_t StrideB = 2048;
+    ck::index_t StrideE = 2048;
 
     if(argc == 1)
     {
@@ -174,6 +174,9 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
+    const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
+                             ? d_m_n.mDesc.GetStrides()[0]
+                             : d_m_n.mDesc.GetStrides()[1];
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
@@ -221,7 +224,7 @@ int main(int argc, char* argv[])
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, 1>{0},
+                               std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
                                StrideE,
                                a_element_op,
                                b_element_op,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
index e630f67837..4e98bf3034 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index 71f6677bae..05994a9bbd 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
index 4665c3932f..81388a178f 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
index cb0271c81f..796a5d3e9b 100644
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -7,7 +7,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
 #endif
     using namespace ck::literals;
 
-    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+    ProblemSize ps =
+        problem_size; // make mutable copy because default stride values of 0 need to be updated
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
 
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -41,6 +43,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
     std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
     std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
 
+    // If any user-provided leading stride <= 0, replace it with the one determined by the
+    // created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
+    auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
+        if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
+        {
+            return static_cast<int>(tensor.GetStrides()[0]);
+        }
+        else
+        {
+            return static_cast<int>(tensor.GetStrides()[1]);
+        }
+    };
+
+    if(StrideA <= 0)
+        StrideA = fetch_leading_stride(a_m_k, ALayout{});
+    if(StrideB <= 0)
+        StrideB = fetch_leading_stride(b_k_n, BLayout{});
+    if(StrideD0 <= 0)
+        StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
+    if(StrideD1 <= 0)
+        StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
+    if(StrideE <= 0)
+        StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
+
     switch(config.init_method)
     {
     case 0: break;
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index 91c072aef7..4f174bfcbb 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -19,4 +19,13 @@ foreach(gpu IN LISTS GPU_TARGETS)
         add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
         set(target 1)
     endif()
-endforeach()
\ No newline at end of file
+endforeach()
+
+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_convnd_fwd_xdl_fp32_tf32 convnd_fwd_xdl_fp32_tf32.cpp)
+        set(target 1)
+    endif()
+endforeach()
diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index b0fd6a382a..d82b56ec00 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -27,10 +27,14 @@ void print_helper_msg()
               << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }
 
-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 5e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
@@ -68,10 +72,14 @@ inline __host__ __device__ constexpr double get_rtol()
     }
 }
 
-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 1e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
@@ -116,7 +124,8 @@ template <ck::index_t NDimSpatial,
           typename InElementOp,
           typename WeiElementOp,
           typename OutElementOp,
-          typename DeviceConvNDFwdInstance>
+          typename DeviceConvNDFwdInstance,
+          typename ComputeDataType = OutDataType>
 bool run_grouped_conv_fwd(bool do_verification,
                           int init_method,
                           bool time_kernel,
@@ -228,7 +237,11 @@ bool run_grouped_conv_fwd(bool do_verification,
                                                                      OutDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
-                                                                     OutElementOp>();
+                                                                     OutElementOp,
+                                                                     0,
+                                                                     0,
+                                                                     0,
+                                                                     ComputeDataType>();
 
         auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in,
@@ -249,8 +262,8 @@ bool run_grouped_conv_fwd(bool do_verification,
         return ck::utils::check_err(out_device,
                                     out_host,
                                     "Error: incorrect results!",
-                                    get_rtol<OutDataType>(),
-                                    get_atol<OutDataType>());
+                                    get_rtol<OutDataType, ComputeDataType>(),
+                                    get_atol<OutDataType, ComputeDataType>());
     }
 
     return true;
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
index b6bb03e1e5..6b66ebbdec 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 #include "run_convnd_fwd_example.inc"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
index 0fc9e7b5dd..d270d446b5 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         ComputeType>;
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
index 9eba00993a..21bfd71a69 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeType,
         BComputeType>;
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index 064a971478..7db7fdf4a8 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 #include "run_convnd_fwd_example.inc"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
index 346ab8d953..62040384ad 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         ComputeType>;
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // fp8 are not supported on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
index 36517e569d..40c38b39d8 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -76,4 +76,11 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
new file mode 100644
index 0000000000..348da7e1ef
--- /dev/null
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using InDataType       = float;
+using WeiDataType      = float;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using OutDataType      = float;
+using ComputeDataType  = ck::tf32_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,         // ALayout
+        WeiLayout,        // BLayout
+        ck::Tuple<>,      // DsLayout
+        OutLayout,        // ELayout
+        InDataType,       // ADataType
+        WeiDataType,      // BDataType
+        AccDataType,      // AccDataType
+        CShuffleDataType, // CShuffleDataType
+        ck::Tuple<>,      // DsDataType
+        OutDataType,      // EDataType
+        InElementOp,      // AElementwiseOperation
+        WeiElementOp,     // BElementwiseOperation
+        OutElementOp,     // CDEElementwiseOperation
+        ConvSpec,         // ConvForwardSpecialization
+        GemmSpec,         // GemmSpecialization
+        1,                // NumGemmKPrefetchStage
+        256,              // BlockSize
+        128,              // MPerBlock
+        192,              // NPerBlock
+        16,               // KPerBlock
+        4,                // AK1
+        4,                // BK1
+        32,               // MPerXdl
+        32,               // NPerXdl
+        2,                // MXdlPerWave
+        3,                // NXdlPerWave
+        S<4, 64, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+        2,                // ABlockTransferSrcVectorDim
+        4,                // ABlockTransferSrcScalarPerVector
+        4,                // ABlockTransferDstScalarPerVector_AK1
+        1,                // ABlockLdsExtraM
+        S<4, 64, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+        2,                // BBlockTransferSrcVectorDim
+        4,                // BBlockTransferSrcScalarPerVector
+        4,                // BBlockTransferDstScalarPerVector_BK1
+        1,                // BBlockLdsExtraN
+        1,                // CShuffleMXdlPerWavePerShuffle
+        1,                // CShuffleNXdlPerWavePerShuffle
+        S<1, 16, 1, 16>,  // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4,                // CDEBlockTransferScalarPerVector_NPerBlock
+        ComputeDataType,  // AComputeDataType
+        ComputeDataType,  // BComputeDataType
+        ck::LoopScheduler::Default, // LoopScheduler
+        1                           // NumGroupsToMerge
+        >;
+
+#include "run_convnd_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
index ef130148bc..c635d01d8f 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -7,6 +7,8 @@
 
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
 using InDataType       = ck::f8_t;
 using WeiDataType      = ck::f8_t;
 using AccDataType      = float;
@@ -52,10 +54,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +75,19 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         ComputeDataType>;
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
index 53a12377c5..de6350db88 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeType,
         BComputeType>;
 
 #include "run_convnd_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 0180e6e718..4ed47d2cae 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -71,8 +71,8 @@ using DeviceGroupedConvNDFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 64, 1, 4>,
-        16>;
+        S<1, 32, 1, 8>,
+        4>;
 
 #include "run_convnd_fwd_example.inc"
 
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 49852ff667..016a189d4b 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -3,6 +3,11 @@
 
 #pragma once
 
+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 bool run_convnd_fwd_example(int argc, char* argv[])
 {
     print_helper_msg();
@@ -65,17 +70,17 @@ bool run_convnd_fwd_example(int argc, char* argv[])
             InElementOp,
             WeiElementOp,
             OutElementOp,
-            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
+            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
+            ComputeDataType>(do_verification,
+                             init_method,
+                             time_kernel,
+                             conv_param,
+                             in_g_n_c_wis_desc,
+                             wei_g_k_c_xs_desc,
+                             out_g_n_k_wos_desc,
+                             in_element_op,
+                             wei_element_op,
+                             out_element_op);
     };
 
     namespace ctc = ck::tensor_layout::convolution;
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
index 036f288d0a..7142521c55 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -125,7 +125,7 @@ inline bool parse_cmd_args(int argc,
 
         const ck::index_t num_dim_spatial = std::stoi(argv[4]);
         problem_size                      = ck::utils::conv::parse_conv_param(
-            num_dim_spatial, threshold_to_catch_partial_args, argv);
+            num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
     }
     else
     {
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
index 5848785673..c1ee36ef99 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -15,4 +15,11 @@ using RsDataType        = ck::Tuple<R0DataType>;
 
 #include "run_convnd_fwd_max_example.inc"
 
-int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_convnd_fwd_max_example(argc, argv);
+}
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
index d61aee81a4..4b290d02a2 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
 static constexpr auto ConvSpec =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 // clang-format off
 template <ck::index_t NDimSpatial>
@@ -36,7 +36,7 @@ using DeviceInstance =
 #ifdef BUILD_INT4_EXAMPLE
         < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
 #else
-        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 #endif
 
 template <ck::index_t NDimSpatial>
diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
index eb8b5c76d3..9e125c4e5d 100644
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     const std::array<int, 2> reduceDims = {3, 4};
     // const std::array<int, 3> invariantDims = {0, 1, 2};
 
-    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
+    std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
 
     // input lengths of the second reduction, which is also the output lengths of the first
     // reduction
-    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
+    std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
 
-    const std::vector<size_t> outLengths = {64, 320, 80};
+    std::vector<size_t> outLengths = {64, 320, 80};
 
     if(argc == 1)
     {
@@ -114,11 +114,26 @@ int main(int argc, char* argv[])
         init_method = 2;
         time_kernel = true;
     }
-    else if(argc == 4)
+    else if((argc == 4) || (argc == 9))
     {
         do_verify   = static_cast<bool>(argv[1]);
         init_method = atoi(argv[2]);
         time_kernel = static_cast<bool>(atoi(argv[3]));
+        if(argc == 9)
+        {
+            inLengths_1[0] = atoi(argv[4]);
+            inLengths_1[1] = atoi(argv[5]);
+            inLengths_1[2] = atoi(argv[6]);
+            inLengths_1[3] = atoi(argv[7]);
+            inLengths_1[4] = atoi(argv[8]);
+            inLengths_2[0] = inLengths_1[0];
+            inLengths_2[1] = inLengths_1[1];
+            inLengths_2[2] = inLengths_1[2];
+            inLengths_2[3] = inLengths_1[3];
+            outLengths[0]  = inLengths_1[0];
+            outLengths[1]  = inLengths_1[1];
+            outLengths[2]  = inLengths_1[2];
+        }
     }
     else
     {
diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
index 3ce08fd2af..abbf1b29f7 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -78,12 +78,12 @@ bool pool_test(bool do_verification,
 
             if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
             {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
             }
             else if constexpr(ck::is_same<decltype(layout),
                                           ck::tensor_layout::convolution::NHWC>::value)
             {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
             }
         };
 
diff --git a/example/14_gemm_quantization/CMakeLists.txt b/example/14_gemm_quantization/CMakeLists.txt
index 8703fa3ed7..b058e7b0fa 100644
--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
+add_example_executable(example_gemm_wmma_quantization_int8 gemm_wmma_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
diff --git a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
index 2585072dfe..5291f5ce69 100644
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -115,12 +115,14 @@ int main()
             if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
             {
                 return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1_uz}));
+                                            std::vector<std::size_t>({stride, 1_uz}),
+                                            layout);
             }
             else
             {
                 return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1_uz, stride}));
+                                            std::vector<std::size_t>({1_uz, stride}),
+                                            layout);
             }
         };
 
diff --git a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
new file mode 100644
index 0000000000..a3023997a1
--- /dev/null
+++ b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+
+using ADataType        = I8;
+using BDataType        = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = I8;
+
+using ALayout  = Col;
+using BLayout  = Row;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    ActivationOp,
+    ActivationOp,
+    CDEElementOp,
+    GemmDefault,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<1>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1,
+    I8,
+    I8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
+
+int main(int /* argc */, char* /* argv */[])
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideE = N;
+
+    float requant_scale = 0.03;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    auto a_element_op   = PassThrough{};
+    auto b_element_op   = PassThrough{};
+    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+
+    // device GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      std::array<const void*, 0>{},
+                                      static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      std::array<ck::index_t, 0>{},
+                                      StrideE,
+                                      1,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
index aa3e011695..8f68ac6b05 100644
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -70,10 +70,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
      64,                         // KPerBlock,
      16,                         // AK1,
      16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
      S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
      S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
      S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -90,8 +90,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
      1,                          // bool BBlockLdsExtraN,
      1,                          // index_t CShuffleMXdlPerWavePerShuffle,
      1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
index 4b207df5c6..db7f3cb091 100644
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -68,10 +68,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
      64,                         // KPerBlock,
      16,                         // AK1,
      16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
      S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
      S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
      S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -88,8 +88,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
      1,                          // bool BBlockLdsExtraN,
      1,                          // index_t CShuffleMXdlPerWavePerShuffle,
      1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
index 63a2aea0b3..c8de51f550 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -63,7 +63,7 @@ using DeviceGemmInstance =
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   16,   16,    2,    4,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
 // clang-format on
 
 struct ProblemSize final
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
index 680cee1f81..ac64a468a4 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on
 
 #include "run_grouped_gemm_example.inc"
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
index 5bdc993192..2fcc0e3cb1 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -323,6 +323,31 @@ int main(int argc, char* argv[])
 
     problem_size.Ms = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
 
+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (>0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ns.push_back(768);
@@ -333,21 +358,5 @@ int main(int argc, char* argv[])
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
 
-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (>0)\n");
-        exit(0);
-    }
-
     return !run_grouped_gemm(problem_size, config);
 }
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
index 6806bd1886..fb611fd444 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -296,6 +296,32 @@ int main(int argc, char* argv[])
 
     problem_size.group_count = 16;
 
+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ms.push_back(128 + rand() % 128);
@@ -307,21 +333,5 @@ int main(int argc, char* argv[])
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
 
-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
     return !run_grouped_gemm(problem_size, config);
 }
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
index 8418c10f5e..47eb6637bd 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -297,6 +297,31 @@ int main(int argc, char* argv[])
 
     problem_size.group_count = 16;
 
+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ms.push_back(256 + 256 * i);
@@ -308,21 +333,5 @@ int main(int argc, char* argv[])
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
 
-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
     return !run_grouped_gemm(problem_size, config);
 }
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 90a12bc1dd..85ea8c2f2c 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on
 
 #include "run_grouped_gemm_example.inc"
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
index 28b0fcd0ce..fb047ae364 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on
 
 #include "run_grouped_gemm_example.inc"
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
index 9f8f6cb1e4..16d018936b 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -66,6 +66,28 @@ int main(int argc, char* argv[])
 
     problem_size.group_count = 16;
 
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 5)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.group_count = std::stoi(argv[4]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: group count (default=16)");
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ms.push_back(256 + 256 * i);
@@ -77,19 +99,5 @@ int main(int argc, char* argv[])
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
 
-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        exit(0);
-    }
-
     return !run_grouped_gemm(problem_size, config);
 }
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 7186c22233..4ef6074f4a 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[])
 
     problem_size.group_count = 16;
 
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.async_hargs       = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: async hargs (0=n0, 1=yes)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ms.push_back(256 + 256 * i);
@@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[])
         problem_size.stride_Bs.push_back(problem_size.Ks[i]);
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.async_hargs     = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: async hargs (0=n0, 1=yes)\n");
-        exit(0);
-    }
 
     return run_grouped_gemm(problem_size, config);
 }
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
index a46eaa4816..3cc38b381b 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -76,7 +76,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
index b28e7f85d3..3de32e9a6d 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          64,                        // KPerBlock
          16,                        // AK1
          16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
index b30ce2c48a..0290c1829d 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -65,10 +65,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          32,                        // KPerBlock
          8,                         // AK1
          8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +85,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
index 31e2efd6f6..e211a63b0b 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -65,10 +65,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          32,                        // KPerBlock
          8,                         // AK1
          8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +85,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
index d3c7c1d99c..90c2cdcdaa 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -146,6 +146,11 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+
     return run_gemm_reduce_max_xdl<ADataType,
                                    BDataType,
                                    EDataType,
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
index 9a4a6bc6e1..e8594e206b 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
index 1a8457a8bf..ee274f3a55 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -64,10 +64,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          64,                        // KPerBlock
          16,                        // AK1
          16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -84,7 +84,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
index 5c2706c79a..3ee3037179 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          32,                        // KPerBlock
          8,                         // AK1
          8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
index c119e24370..9ce1e76cf5 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          32,                        // KPerBlock
          8,                         // AK1
          8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
          S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
          S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
          S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
          1,                         // BBlockLdsExtraN
          1,                         // CShuffleMXdlPerWavePerShuffle
          1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
          4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
          1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
index 0f5e588383..7815d2beea 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
@@ -153,6 +153,11 @@ int main(int argc, char* argv[])
         exit(EXIT_SUCCESS);
     }
 
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        exit(EXIT_SUCCESS);
+    }
+
     return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
                                                 BDataType,
                                                 EDataType,
diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt
index 03ba0a65df..1d1f255187 100644
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,8 +1 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
-   set(target 1)
- endif()
-endforeach()
+add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index 42bfea372e..4a701e7792 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -64,7 +64,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
 //######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|    Type Tuple| Elementwise| Elementwise| Elementwise|      Reduce|               |                |   MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
 //######|        |        |        |     |      |      |         |         |          |              |   Operation|   Operation|   Operation|   Operation|               |                |    Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
 //######|        |        |        |     |      |      |         |         |          |              |            |            |            |            |               |                |             |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,             S<32, 8>,                         4,                            1>;
 // clang-format on
 
 using ReferenceBatchedGemmInstance =
@@ -137,11 +137,13 @@ int main(int argc, char* argv[])
 
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp
index e0034bf7eb..9159e51eaf 100644
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -123,7 +123,9 @@ inline bool parse_cmd_args(int argc,
 
         const ck::index_t num_dim_spatial = std::stoi(argv[4]);
         conv_param                        = ck::utils::conv::parse_conv_param(
-            num_dim_spatial, threshold_to_catch_partial_args, argv);
+            num_dim_spatial,
+            threshold_to_catch_partial_args + 1, // +1 because we already parsed num_dim_spatial
+            argv);
     }
     else
     {
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
index 5dccb11bba..abbc7a946c 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -80,7 +80,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
               << " GB/s, " << std::endl;
 }
 
-int main()
+int main(int argc, char* argv[])
 {
     // GEMM shape
     ck::index_t M = 1024;
@@ -249,6 +249,25 @@ int main()
     ck::index_t StrideD1 = 1024;
     ck::index_t StrideE  = 1024;
 
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
     Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
@@ -357,6 +376,7 @@ int main()
     normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
 
     bool pass = true;
+    if(do_verification)
     {
         // verification
         Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -383,27 +403,25 @@ int main()
                                      1e-2);
     }
 
+    if(time_kernel)
     {
         // evaluate kernel perf
-        bool time_kernel = true;
-
         float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
         float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
 
-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  D0DataType,
-                                  D1DataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              D0DataType,
+                              D1DataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
     }
 
     return pass ? 0 : 1;
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
index 6a92e9a2f5..ae5e3f36ad 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -65,7 +65,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|             Type|          Type|         Type|      Type| Elementwise| Elementwise|  Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
 //######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |   Operation|   Operation|    Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|            _M_N|                 _M_N|              _M|
 //######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |            |            |             |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |
-        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               8,             S<8, 32>,               8>;
+        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               4,             S<8, 32>,               4>;
 // clang-format on
 
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
@@ -154,6 +154,12 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
 
 int main()
 {
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
     bool do_verification = true;
 
     // GEMM shape
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
index 168193ad5b..23c602c39e 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -77,7 +77,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
               << " GB/s, " << std::endl;
 }
 
-int main()
+int main(int argc, char* argv[])
 {
     // GEMM shape
     ck::index_t M = 1024;
@@ -232,6 +232,25 @@ int main()
     ck::index_t StrideB = 1024;
     ck::index_t StrideE = 1024;
 
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
     Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
@@ -333,6 +352,7 @@ int main()
     normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
 
     bool pass = true;
+    if(do_verification)
     {
         // verification
         Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -354,25 +374,23 @@ int main()
             layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
     }
 
+    if(time_kernel)
     {
         // evaluate kernel perf
-        bool time_kernel = true;
-
         float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
         float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
 
-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
     }
 
     return pass ? 0 : 1;
diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
index 277fea0272..10d90b795c 100644
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -70,7 +70,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl
 //######|        |        |        |      Type|      Type|      Type|       Type|    DataType|         DataType|    DataType| Elementwise| Elementwise|  Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector|
 //######|        |        |        |          |          |          |           |            |                 |            |   Operation|   Operation|    Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|            _NPerBlock|
 //######|        |        |        |          |          |          |           |            |                 |            |            |            |             |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                      |
-        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8,             S<64, 4>,                     4>;
+        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           4,               S<1, 32, 1, 8>,               8,             S<32, 8>,                     4>;
 // clang-format on
 
 using ReferenceInstance = ck::tensor_operation::host::ReferenceGemmLayernorm<ADataType,
diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp
index fa4482a984..716d36b487 100644
--- a/example/22_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/22_cgemm/cgemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
@@ -48,10 +48,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
      32,                         // index_t KPerBlock
      8,                          // index_t AK1
      8,                          // index_t BK1
-     32,                         // index_t MPerXDL
-     32,                         // index_t NPerXDL
-     4,                          // index_t MXdlPerWave
-     2,                          // index_t NXdlPerWave
+     16,                         // index_t MPerXDL
+     16,                         // index_t NPerXDL
+     8,                          // index_t MXdlPerWave
+     4,                          // index_t NXdlPerWave
      S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
      S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
      S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
@@ -69,7 +69,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
      1,                          // index_t CShuffleMXdlPerWavePerShuffle
      1,                          // index_t CShuffleNXdlPerWavePerShuffle
      S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
 // clang-format on
 
 int main(int argc, char* argv[])
diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp
index 89a581e865..2996d87b28 100644
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
@@ -47,10 +47,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
      32,                         // index_t KPerBlock
      8,                          // index_t AK1
      8,                          // index_t BK1
-     32,                         // index_t MPerXDL
-     32,                         // index_t NPerXDL
-     4,                          // index_t MXdlPerWave
-     2,                          // index_t NXdlPerWave
+     16,                         // index_t MPerXDL
+     16,                         // index_t NPerXDL
+     8,                          // index_t MXdlPerWave
+     4,                          // index_t NXdlPerWave
      S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
      S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
      S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
@@ -68,7 +68,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
      1,                          // index_t CShuffleMXdlPerWavePerShuffle
      1,                          // index_t CShuffleNXdlPerWavePerShuffle
      S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
 // clang-format on
 
 int main(int argc, char* argv[])
diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc
index 741512bf00..c93a2051d2 100644
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -59,11 +59,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
index 3582bc5e33..ac34ed5b8a 100644
--- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
@@ -137,11 +137,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                        auto layout) {
         if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
index 778be8ffd7..9939429a08 100644
--- a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
@@ -64,11 +64,13 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
 
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
index 420a7cf74f..4f4003809b 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -19,6 +19,9 @@
 
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+            e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
                                                                      NumDimN,
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
index 9d606db205..e4881da0f5 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -17,6 +17,9 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+            e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
                                                                      NumDimM,
diff --git a/example/26_contraction/run_contraction_bilinear_example.inc b/example/26_contraction/run_contraction_bilinear_example.inc
index 7b9a01e0ae..ddb6d678e2 100644
--- a/example/26_contraction/run_contraction_bilinear_example.inc
+++ b/example/26_contraction/run_contraction_bilinear_example.inc
@@ -15,6 +15,8 @@
 #include "ck/library/utility/numeric.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
+using Row = ck::tensor_layout::gemm::RowMajor;
+
 int run_contraction_bilinear_example(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -95,11 +97,11 @@ int run_contraction_bilinear_example(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
+    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
     std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
     std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -189,7 +191,7 @@ int run_contraction_bilinear_example(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
         using ReferenceOpInstance =
             ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
diff --git a/example/26_contraction/run_contraction_scale_example.inc b/example/26_contraction/run_contraction_scale_example.inc
index 65ca182da2..0a7287edf9 100644
--- a/example/26_contraction/run_contraction_scale_example.inc
+++ b/example/26_contraction/run_contraction_scale_example.inc
@@ -15,6 +15,8 @@
 #include "ck/library/utility/numeric.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
+using Row = ck::tensor_layout::gemm::RowMajor;
+
 int run_contraction_scale_example(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -85,10 +87,10 @@ int run_contraction_scale_example(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
     std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
     std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -173,7 +175,7 @@ int run_contraction_scale_example(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
         using ReferenceOpInstance =
             ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
index 24e9b1d9b7..70c4a01185 100644
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -18,6 +18,9 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -53,7 +56,7 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
         //############################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //############################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |               |               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //############################################|        |        |        |      |      |        |         |           |      |             |            |             |               |               |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F16,   F16,     F32,      F16, DsDataType,   F16,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<1, 32, 1, 4>,               8>;
+        DeviceGroupedContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F16,   F16,     F32,      F16, DsDataType,   F16,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<1, 32, 1, 4>,               4>;
 // clang-format on
 
 // hardcoded for NumDimM == NumDimN == NumDimK == 2
@@ -194,22 +197,28 @@ int main(int argc, char* argv[])
     int init_method      = 1;
     bool time_kernel     = false;
 
-    if(argc == 4)
+    std::size_t group_count = rand() % 16 + 1;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 5)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
         time_kernel     = std::stoi(argv[3]);
+        group_count     = std::stoi(argv[4]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: group count (default = random from 1..16)");
         exit(0);
     }
 
-    std::size_t group_count = rand() % 16 + 1;
-
     // GEMM shape
     std::vector<ck::tensor_operation::device::ContractionDesc<1>> contraction_descs;
     std::vector<const void*> p_a, p_b;
@@ -298,10 +307,10 @@ int main(int argc, char* argv[])
         const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
         const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
 
-        Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-        Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-        Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-        Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
+        Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
+        Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Bypass{});
+        Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
         ck::index_t M_ =
             ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
@@ -410,9 +419,9 @@ int main(int argc, char* argv[])
             const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
             const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
 
-            Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+            Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
-            Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+            Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
             e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());
 
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
index f556be887f..c4cb7a13a2 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
@@ -17,6 +17,9 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -300,11 +303,11 @@ int main(int argc, char* argv[])
     std::vector<ck::index_t> e_gs_ms_ns_strides{
         G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
     std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
@@ -396,7 +399,8 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                      NumDimM,
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
index 08158bfc25..2b9afc342e 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -17,6 +17,9 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -54,7 +57,7 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
         //############################################|        |        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //############################################|        |        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |               |               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //############################################|        |        |        |        |      |      |        |         |           |      |             |            |             |               |               |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK,   F16,   F16,     F32,      F16, DsDataType,   F16,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<1, 32, 1, 4>,               8>;
+        DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK,   F16,   F16,     F32,      F16, DsDataType,   F16,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<1, 32, 1, 4>,               4>;
 // clang-format on
 
 using DeviceOpInstance = DeviceOpInstanceKKNN;
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -345,7 +348,8 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                      NumDimM,
diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp
index 39463614f5..92fdaa7f33 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp
@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                 conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                 1,                                                                    // c
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCW{});
 
     case 2:
         return HostTensorDescriptor(
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                 1,                                                                    // c
                 conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCHW{});
 
     case 3:
         return HostTensorDescriptor(
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                     conv_param.G_ * conv_param.C_,                                    // di
                 conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                 conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                 1,                                                                     // c
                 conv_param.C_                                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCX{});
     case 2:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                 1,                                                     // c
                 conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
                 conv_param.C_                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCYX{});
     case 3:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                     conv_param.C_,                                     // z
                 conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                 conv_param.C_                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCZYX{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                 0,             // k
                 1,             // c
                 0              // x
-            });
+            },
+            ck::tensor_layout::convolution::GNKW{});
     case 2:
         return HostTensorDescriptor({conv_param.G_,
                                      conv_param.N_,
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                                         1,             // k
                                         0,             // ho
                                         0              // wo
-                                    });
+                                    },
+                                    ck::tensor_layout::convolution::GNKHW{});
     case 3:
         return HostTensorDescriptor({conv_param.G_,
                                      conv_param.N_,
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                                         0,             // z
                                         0,             // y
                                         0              // x
-                                    });
+                                    },
+                                    ck::tensor_layout::convolution::GNKDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                 conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                 1,                                                                     // k
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKW{});
     case 2:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                 1,                                                                     // k
                 conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKHW{});
 
     case 3:
         return HostTensorDescriptor(
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                     conv_param.G_ * conv_param.K_,                                     // do
                 conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
index ae769ff1d3..56b0e6d1dc 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                 conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                 1,                                                                    // c
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCW{});
 
     case 2:
         return HostTensorDescriptor(
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                 1,                                                                    // c
                 conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCHW{});
 
     case 3:
         return HostTensorDescriptor(
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
                     conv_param.G_ * conv_param.C_,                                    // di
                 conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                 conv_param.G_ * conv_param.C_                                         // wi
-            });
+            },
+            ck::tensor_layout::convolution::GNCDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                 conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                 1,                                                                     // c
                 conv_param.C_                                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCX{});
     case 2:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                 1,                                                     // c
                 conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
                 conv_param.C_                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCYX{});
     case 3:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
                     conv_param.C_,                                     // z
                 conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                 conv_param.C_                                          // x
-            });
+            },
+            ck::tensor_layout::convolution::GKCZYX{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                 0,             // k
                 1,             // c
                 0              // x
-            });
+            },
+            ck::tensor_layout::convolution::GNKW{});
     case 2:
         return HostTensorDescriptor({conv_param.G_,
                                      conv_param.N_,
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                                         1,             // k
                                         0,             // ho
                                         0              // wo
-                                    });
+                                    },
+                                    ck::tensor_layout::convolution::GNKHW{});
     case 3:
         return HostTensorDescriptor({conv_param.G_,
                                      conv_param.N_,
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
                                         0,             // z
                                         0,             // y
                                         0              // x
-                                    });
+                                    },
+                                    ck::tensor_layout::convolution::GNKDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                 conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                 1,                                                                     // k
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKW{});
     case 2:
         return HostTensorDescriptor(
             {conv_param.G_,
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                 1,                                                                     // k
                 conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKHW{});
 
     case 3:
         return HostTensorDescriptor(
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
                     conv_param.G_ * conv_param.K_,                                     // do
                 conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                 conv_param.G_ * conv_param.K_                                          // wo
-            });
+            },
+            ck::tensor_layout::convolution::GNKDHW{});
     }
 
     throw std::runtime_error("unsuppored # dim spatial");
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
index 43c0d57dc2..6bc0e69fee 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
index 40b4132b35..bc919c1a56 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
index e05d384f26..b395962b63 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -23,4 +23,11 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv);
+}
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
index 6bf2e8d963..d616948cc0 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
index 498eda2442..b6b168657d 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
index 627e20e245..8dcc9e21ad 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -108,10 +108,10 @@ using DeviceConvFwdInstance =
         32,          // KPerBlock
         4,           // AK1
         4,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        2,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        4,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -128,7 +128,7 @@ using DeviceConvFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 16, 1, 16>,
+        S<1, 32, 1, 8>,
         4>;
 
 template <ck::index_t NDimSpatial>
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
index da65bb1886..c661871dfa 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
@@ -27,10 +27,10 @@ using DeviceConvFwdInstance =
         16,          // KPerBlock
         4,           // AK1
         4,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -47,7 +47,7 @@ using DeviceConvFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 16, 1, 16>,
+        S<1, 32, 1, 8>,
         4>;
 
 template <ck::index_t NDimSpatial>
diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt
index 3e8c9afd9d..811b133b44 100644
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -1,10 +1,16 @@
 add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
 add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
 add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
+
+add_example_executable(example_batched_gemm_gemm_wmma_cshuffle_v3_bf16 batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp)
+add_example_executable(example_batched_gemm_gemm_wmma_cshuffle_v3_fp8 batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp)
+add_example_executable(example_batched_gemm_gemm_wmma_cshuffle_v3_fp16 batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp)
+add_example_executable(example_batched_gemm_gemm_wmma_cshuffle_v3_int8 batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp)
+
 if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)
 
-if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95" AND NOT GPU_TARGETS MATCHES "gfx1")
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95")
    add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
 endif()
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_base.inc b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_base.inc
new file mode 100644
index 0000000000..de4f5f09e7
--- /dev/null
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_base.inc
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Gemm + Gemm fused operation. Computes C_g_m_n = (A_g_m_k * B0_g_k_l) * B1_g_l_n
+                                                |------------------|
+                                                        Gemm0
+                                                |-----------------------------|
+                                                             Gemm1
+*/
+
+static constexpr auto PipeSched   = ck::BlockGemmPipelineScheduler::Interwave;
+static constexpr auto PipelineVer = ck::BlockGemmPipelineVersion::v1;
+static constexpr auto GemmSpec    = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+// clang-format off
+// #define CK_MHA_USE_RCCR_LAYOUT
+#define CK_MHA_USE_WAVE_1
+// #define CK_MHA_USE_WAVE_2
+// #define CK_MHA_USE_WAVE_4
+// #define CK_MHA_USE_WAVE_8
+
+#ifdef CK_MHA_USE_RCCR_LAYOUT
+using DeviceMHAFactory = 
+    std::tuple<
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Col, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            32,
+            //      Gemm 0
+            16, 64, 64, 64, 64, 8,  8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 4, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false,
+            // B0BlockTransfer LK -> K0 L K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 16, 1, 2>, 8,
+            PipeSched, PipelineVer>
+    >;
+#else
+using DeviceMHAFactory = 
+    std::tuple<
+#ifdef CK_MHA_USE_WAVE_1
+        // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            32,
+            //      Gemm 0
+            16, 128, 64, 64, 64, 8,  8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 8, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 16, 1, 2>, 8, 
+            PipeSched, PipelineVer>,
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            32,
+            //      Gemm 0
+            16, 64, 64, 64, 64, 8,  8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 4, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 16, 1, 2>, 8,
+            PipeSched, PipelineVer>
+#endif
+#ifdef CK_MHA_USE_WAVE_2
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            64,
+            //      Gemm 0
+            32, 128, 64, 64, 64, 8, 8,
+            //      Gemm 1
+                 8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 8, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 32, 1, 2>, 8,
+            PipeSched, PipelineVer>,
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            64,
+            //      Gemm 0
+            32, 64, 64, 64, 64, 8, 8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 4, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 32, 1, 2>, 8,
+            PipeSched, PipelineVer>
+#endif
+#ifdef CK_MHA_USE_WAVE_4
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            128,
+            //      Gemm 0
+            64, 128, 64, 64, 64, 8, 8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 8, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 64, 1, 2>, 8,
+            PipeSched, PipelineVer>,
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            128,
+            //      Gemm 0
+            64, 64, 64, 64, 64, 8, 8,
+            //      Gemm 1
+                8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 4, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 64, 1, 2>, 8,
+            PipeSched, PipelineVer>
+#endif
+#ifdef CK_MHA_USE_WAVE_8
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            256,
+            //      Gemm 0
+            128, 128, 64, 64, 64, 8, 8,   
+            //      Gemm 1
+                  8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 8, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<8,  32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2,  16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 128, 1, 2>, 8,
+            PipeSched, PipelineVer>,
+        ck::tensor_operation::device::DeviceBatchedGemmGemm_Wmma_CShuffleV3<
+            Row, Col, Row, Row, 
+            ADataType, B0DataType, B1DataType, CDataType, AccDataType, CShuffleDataType,
+            AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+            GemmSpec,
+            256,
+            //      Gemm 0
+            128, 128, 64, 64, 64, 8, 8,   
+            //      Gemm 1
+                  8,  
+            16, 16,
+            // Per repeat = wave_m = wave_num, wave_n = 1
+            1, 8, 4,
+            // ABlockTransfer MK -> K0 M K1
+            S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B0BlockTransfer LK -> K0 L K1
+            S<8,  32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+            // B1BlockTransfer NL -> L0 N L1
+            S<2,  16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false,
+            // CShuffleBlockTransfer MN
+            1, 1, S<1, 128, 1, 2>, 8,
+            PipeSched, PipelineVer>
+#endif
+    >;
+#endif
+
+// clang-format on
+// Ref Gemm0
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                Acc0ElementOp>;
+
+// Ref Gemm1
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
+#include "run_batched_gemm_gemm_wmma_cshuffle_v3.inc"
+
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported() || ck::is_gfx12_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp
new file mode 100644
index 0000000000..d1a8e7f30c
--- /dev/null
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp
@@ -0,0 +1,37 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using B0DataType       = BF16;
+using B1DataType       = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = BF16;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+#include "batched_gemm_gemm_wmma_cshuffle_v3_base.inc"
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp
new file mode 100644
index 0000000000..57b95b7412
--- /dev/null
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp
@@ -0,0 +1,37 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using B0DataType       = F16;
+using B1DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+#include "batched_gemm_gemm_wmma_cshuffle_v3_base.inc"
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp
new file mode 100644
index 0000000000..82ac374efe
--- /dev/null
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = ck::f8_t;
+using B0DataType       = ck::f8_t;
+using B1DataType       = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using CDataType        = ck::f8_t;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+#include "batched_gemm_gemm_wmma_cshuffle_v3_base.inc"
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp
new file mode 100644
index 0000000000..44d63a0d43
--- /dev/null
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = int8_t;
+using B0DataType       = int8_t;
+using B1DataType       = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using CDataType        = int8_t;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+#include "batched_gemm_gemm_wmma_cshuffle_v3_base.inc"
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
index 7605d9c4f8..9afd199f24 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
@@ -84,11 +84,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     8,           // AK1
     8,           // BK1
     2,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    4,           // Gemm1NXdlPerWave
+    16,          // MPerXDL
+    16,          // NPerXDL
+    2,           // MXdlPerWave
+    8,           // NXdlPerWave
+    8,           // Gemm1NXdlPerWave
     S<4, 64, 1>, // ABlockTransfer
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -113,7 +113,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                                                 B0DataType,
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
index 33ed04fb30..6a8aa8a721 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
@@ -84,11 +84,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     8,           // AK1
     8,           // BK1
     2,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    4,           // Gemm1NXdlPerWave
+    16,          // MPerXDL
+    16,          // NPerXDL
+    2,           // MXdlPerWave
+    8,           // NXdlPerWave
+    8,           // Gemm1NXdlPerWave
     S<4, 64, 1>, // ABlockTransfer
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -113,7 +113,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                                                 B0DataType,
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
index e0eb193ad0..8a27fb2b3c 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
@@ -132,4 +132,11 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
 
 #include "run_batched_gemm_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
index 2caee6b8dc..bf55e2fd84 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
index 40f87d1f55..e5789ba4b0 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
@@ -81,11 +81,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     16,          // AK1
     16,          // BK1
     4,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    4,           // Gemm1NXdlPerWave
+    16,          // MPerXDL
+    16,          // NPerXDL
+    2,           // MXdlPerWave
+    8,           // NXdlPerWave
+    8,           // Gemm1NXdlPerWave
     S<4, 64, 1>, // ABlockTransfer
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -110,7 +110,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_X
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                                                 B0DataType,
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
index d545508680..7a03e9cacf 100644
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -110,11 +110,13 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
 
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
@@ -270,7 +272,18 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
         c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
 #endif
 
-        return ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
+        if constexpr(ck::is_same_v<CDataType, ck::half_t>)
+        {
+            return ck::utils::check_err(c_g_m_o_device_result,
+                                        c_g_m_o_host_result,
+                                        "Error: Incorrect results!",
+                                        1e-3,
+                                        1.1e-3);
+        }
+        else
+        {
+            return ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
+        }
     }
 
     return true;
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_wmma_cshuffle_v3.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_wmma_cshuffle_v3.inc
new file mode 100644
index 0000000000..cea18459f4
--- /dev/null
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_wmma_cshuffle_v3.inc
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+int run(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape for A/B0/B1/C
+    // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
+    ck::index_t M = 113;
+#ifdef CK_MHA_USE_RCCR_LAYOUT
+    ck::index_t N = 480; // Must be multiple of 8 even with padding.
+#else
+    ck::index_t N = 477;
+#endif
+    ck::index_t K = 200; // Must be multiple of 8 even with padding.
+    ck::index_t O = 208; // Must be multiple of 8 even with padding.
+    ck::index_t G = 91;  // Batch
+
+    float alpha = 1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        O = std::stoi(argv[7]);
+        G = std::stoi(argv[8]);
+
+        alpha = std::stof(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 8: M, N, K, O, G\n");
+        printf("arg9: scale (alpha)\n");
+        exit(0);
+    }
+
+    std::vector<ck::index_t> a_g_m_k_lengths{G, M, K};
+    std::vector<ck::index_t> a_g_m_k_strides{M * K, K, 1}; // A layout [G, M, K]
+    std::vector<ck::index_t> b0_g_n_k_lengths{G, N, K};
+    std::vector<ck::index_t> b0_g_n_k_strides{N * K, K, 1}; // B0 layout [G, N, K]
+    std::vector<ck::index_t> b1_g_o_n_lengths{G, O, N};
+#ifdef CK_MHA_USE_RCCR_LAYOUT
+    std::vector<ck::index_t> b1_g_o_n_strides{N * O, N, 1}; // B1 layout [G, O, N]
+    auto b1_layout = Row{};
+#else
+    std::vector<ck::index_t> b1_g_o_n_strides{N * O, 1, O}; // B1 layout [G, N, O]
+    auto b1_layout = Col{};
+#endif
+    std::vector<ck::index_t> c_g_m_o_lengths{G, M, O};
+    std::vector<ck::index_t> c_g_m_o_strides{M * O, O, 1}; // C layout [G, M, O]
+
+    Tensor<ADataType> a_g_m_k(a_g_m_k_lengths, a_g_m_k_strides, Row{});
+    Tensor<B0DataType> b0_g_n_k(b0_g_n_k_lengths, b0_g_n_k_strides, Row{});
+    Tensor<B1DataType> b1_g_o_n(b1_g_o_n_lengths, b1_g_o_n_strides, b1_layout);
+    Tensor<CDataType> c_g_m_o_host_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
+    Tensor<CDataType> c_g_m_o_device_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b0_g_n_k: " << b0_g_n_k.mDesc << std::endl;
+    std::cout << "b1_g_o_n: " << b1_g_o_n.mDesc << std::endl;
+    std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        break;
+    case 2:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        break;
+    case 3:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        break;
+    case 4: // A, B0, B1 1
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 5: // Rand: b1 b0; unit: a
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        break;
+    case 6: // Rand: a b0 ; unit: B1
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 7: // Rand: a b1 ; unit: b0
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        break;
+    case 8: // Rand: a ; unit: b0 b1
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 9: // Rand: b0 ; unit: a b1
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 10: // Rand: b1 ; unit: a b0
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
+        b0_g_n_k.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_g_o_n.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_g_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_g_o_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_g_n_k.mData.data());
+    b1_device_buf.ToDevice(b1_g_o_n.mData.data());
+
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    // do GEMM
+    float best_perf         = .0;
+    float best_time         = .0;
+    int not_pass            = 0;
+    std::string best_kernel = "";
+    printf("Verification: %s\n", do_verification ? "ON" : "OFF");
+
+    ck::static_for<0, std::tuple_size_v<DeviceMHAFactory>, 1>{}([&](auto i) -> void {
+        const auto device_mha_instance = std::get<i>(DeviceMHAFactory{});
+
+        using DeviceMHAInstance = ck::remove_cvref_t<decltype(device_mha_instance)>;
+        auto gemm               = DeviceMHAInstance{};
+        auto invoker_ptr        = gemm.MakeInvokerPointer();
+        auto argument_ptr =
+            gemm.MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                     static_cast<B0DataType*>(b0_device_buf.GetDeviceBuffer()),
+                                     static_cast<B1DataType*>(b1_device_buf.GetDeviceBuffer()),
+                                     static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                     M,
+                                     N,
+                                     K,
+                                     O,
+                                     G,                   // Batch,
+                                     a_g_m_k_strides[1],  // StrideA,
+                                     b0_g_n_k_strides[1], // StrideB0,
+#ifdef CK_MHA_USE_RCCR_LAYOUT
+                                     b1_g_o_n_strides[1], // StrideB1,
+#else
+                                     b1_g_o_n_strides[2], // StrideB1,
+#endif
+                                     c_g_m_o_strides[1],  // StrideC,
+                                     a_g_m_k_strides[0],  // BatchStrideA
+                                     b0_g_n_k_strides[0], // BatchStrideB0
+                                     b1_g_o_n_strides[0], // BatchStrideB1
+                                     c_g_m_o_strides[0],  // BatchStrideC
+                                     a_element_op,
+                                     b0_element_op,
+                                     acc0_element_op,
+                                     b1_element_op,
+                                     c_element_op);
+
+        if(!gemm.IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+            return;
+        }
+
+        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G;
+        std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                                 sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+                                G;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+        if(tflops > best_perf)
+        {
+            best_perf   = tflops;
+            best_time   = ave_time * 1000;
+            best_kernel = gemm.GetTypeString();
+        }
+        if(do_verification)
+        {
+            c_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
+
+            Tensor<B0DataType> b0_g_k_n({G, K, N});
+            Tensor<B1DataType> b1_g_n_o({G, N, O});
+            Tensor<AccDataType> acc0_g_m_n({G, M, N}); // scratch object after gemm0
+            Tensor<ADataType> a1_g_m_n({G, M, N});     // scratch object after conversion
+
+            // permute
+            b0_g_n_k.ForEach(
+                [&](auto& self, auto idx) { b0_g_k_n(idx[0], idx[2], idx[1]) = self(idx); });
+            b1_g_o_n.ForEach(
+                [&](auto& self, auto idx) { b1_g_n_o(idx[0], idx[2], idx[1]) = self(idx); });
+
+            // gemm 0
+            auto ref_gemm0          = ReferenceGemm0Instance{};
+            auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+            auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+                a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);
+
+            ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+            acc0_g_m_n.ForEach([&](auto& self, auto idx) {
+                // Passthrough instead of softmax, DOES involve data type conversion.
+                a1_g_m_n(idx) = ck::type_convert<ADataType, AccDataType>(self(idx));
+            });
+
+            // gemm1
+            auto ref_gemm1          = ReferenceGemm1Instance{};
+            auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+            auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
+                                                             b1_g_n_o,
+                                                             c_g_m_o_host_result,
+                                                             PassThrough{},
+                                                             b1_element_op,
+                                                             c_element_op);
+
+            ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+            // default absolute error and relative error is 0.001
+            double rtol = 1e-3;
+            double atol = 1e-3;
+
+            // when BF16 is taken, set absolute error and relative error to 0.01
+            if(std::is_same_v<ADataType, ck::bhalf_t> && std::is_same_v<B0DataType, ck::bhalf_t> &&
+               std::is_same_v<B1DataType, ck::bhalf_t> && std::is_same_v<CDataType, ck::bhalf_t>)
+            {
+                rtol = 1e-2;
+                atol = 1e-2;
+            }
+
+            bool this_run_verification = ck::utils::check_err(c_g_m_o_device_result.mData,
+                                                              c_g_m_o_host_result.mData,
+                                                              "Error: Incorrect results!",
+                                                              rtol,
+                                                              atol);
+            printf("Verification: %s, Pass: %s\n",
+                   do_verification ? "ON" : "OFF",
+                   this_run_verification ? "YES" : "NO");
+
+            if(!this_run_verification)
+            {
+                not_pass = 1;
+                printf("%d th MHA instance verification Failed \n", i.value);
+            }
+        }
+    });
+    std::cout << "---------------------------------------------------------------------------------"
+                 "-----------"
+              << std::endl;
+    std::cout << "Problem Size: G: " << G << ", M: " << M << ", N: " << N << ", K: " << K
+              << ", O: " << O << std::endl;
+    std::cout << "---------------------------------------------------------------------------------"
+                 "-----------"
+              << std::endl;
+    std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time
+              << " us" << std::endl;
+    std::cout << "---------------------------------------------------------------------------------"
+                 "-----------"
+              << std::endl;
+    return not_pass;
+}
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index 1d1566d575..2604a50a76 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -101,11 +101,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -130,7 +130,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec>;   // MaskingSpecialization
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
index bae88d4b8e..331bfe99c2 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -100,11 +100,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -129,7 +129,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec>;   // MaskingSpecialization
 
 // Ref Gemm0: bf16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index a098ce6675..cd321c0da3 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -101,11 +101,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -130,7 +130,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec>;   // MaskingSpecialization
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
index ce8caf7588..f30ec3fd03 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -84,11 +84,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     8,           // AK1
     8,           // BK1
     2,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    2,           // Gemm1NXdlPerWave
+    16,          // MPerXDL
+    16,          // NPerXDL
+    2,           // MXdlPerWave
+    8,           // NXdlPerWave
+    4,           // Gemm1NXdlPerWave
     S<4, 64, 1>, // ABlockTransfer
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -113,7 +113,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+    4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
     false>;
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
index 138db14963..e403ba7f66 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -85,11 +85,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     8,           // AK1
     8,           // BK1
     2,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    2,           // Gemm1NXdlPerWave
+    16,          // MPerXDL
+    16,          // NPerXDL
+    2,           // MXdlPerWave
+    8,           // NXdlPerWave
+    4,           // Gemm1NXdlPerWave
     S<4, 64, 1>, // ABlockTransfer
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -114,7 +114,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+    4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
     false>;
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index 5794924294..7738a6b6d4 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -100,11 +100,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -129,7 +129,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec>;   // MaskingSpecialization
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index 97caec6053..b59498829e 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -101,11 +101,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -130,7 +130,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec>;   // MaskingSpecialization
 
 // Ref Gemm0: fp16 in, fp32 out
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
index 1514fc48b3..aa2a6b3b42 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
@@ -111,12 +111,14 @@ int run(int argc, char* argv[])
         if(std::is_same<decltype(layout), Row>::value)
         {
             return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+                                        std::vector<std::size_t>({batch_stride, stride, 1}),
+                                        layout);
         }
         else
         {
             return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+                                        std::vector<std::size_t>({batch_stride, 1, stride}),
+                                        layout);
         }
     };
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
index 2b02069e65..6175f0b5be 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -88,11 +90,11 @@ int run(int argc, char* argv[])
             ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
             : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Bypass{});
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Bypass{});
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Bypass{});
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
index e0ccb6dad1..db13e3b963 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -88,11 +92,30 @@ int run(int argc, char* argv[])
             ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
             : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
+
+    Tensor<ADataType> a_gs_ms_ks(
+        f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+    Tensor<B0DataType> b0_gs_ns_ks(
+        f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+    Tensor<B1DataType> b1_gs_os_ns(
+        f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
index 0ad031cc71..1e4b52d4cf 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -113,11 +117,30 @@ int run(int argc, char* argv[])
                              head_dim,
                              1}; // C layout [batch_size, head_num, q_sequence_length, head_dim]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
+
+    Tensor<ADataType> a_gs_ms_ks(
+        f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+    Tensor<B0DataType> b0_gs_ns_ks(
+        f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+    Tensor<B1DataType> b1_gs_os_ns(
+        f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -191,7 +214,7 @@ int run(int argc, char* argv[])
         head_num * 2 * head_dim,
         head_dim,
         1}; // kv layout [batch_size, q_sequence_length, head_num, 2, head_dim]
-    Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides);
+    Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides, Bypass{});
     // merge kv into a packed pointer send to device
     b0_gs_ns_ks.ForEach(
         [&](auto& self, auto idx) { kv_gs_ns_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
index c693995140..874d987a1d 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -63,6 +67,19 @@ int run(int argc, char* argv[])
 
     std::size_t flop = 0, num_byte = 0;
 
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
     std::cout << "group count " << group_count << ". printing first 4 groups\n";
     for(std::size_t i = 0; i < group_count; i++)
     {
@@ -113,10 +130,14 @@ int run(int argc, char* argv[])
                                  {}}); // acc1_biases_gs_ms_os_strides
 
         // C_m_o = A_m_k * B0_k_n * B1_n_o
-        Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-        Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-        Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-        Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+        Tensor<ADataType> a_gs_ms_ks(
+            f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+        Tensor<B0DataType> b0_gs_ns_ks(f_host_tensor_descriptor(
+            b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+        Tensor<B1DataType> b1_gs_os_ns(f_host_tensor_descriptor(
+            b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+        Tensor<CDataType> c_gs_ms_os_device_result(f_host_tensor_descriptor(
+            c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
         int Batch = G0 * G1;
         flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
@@ -252,7 +273,8 @@ int run(int argc, char* argv[])
             Tensor<AccDataType> acc0_g_m_n({G0 * G1, M, N});        // scratch object after gemm0
             Tensor<ADataType> a1_g_m_n({G0 * G1, M, N});            // scratch object after softmax
             Tensor<CDataType> c_g_m_o_host_result({G0 * G1, M, O}); // scratch object after gemm1
-            Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+            Tensor<CDataType> c_gs_ms_os_host_result(f_host_tensor_descriptor(
+                c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
             // permute
             a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
index 7ac29f33ca..1c2a26d916 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
             ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
             : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
+
+    Tensor<ADataType> a_gs_ms_ks(
+        f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+    Tensor<B0DataType> b0_gs_ns_ks(
+        f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+    Tensor<B1DataType> b1_gs_os_ns(
+        f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
index fb9b1b0bd7..76f3ee756c 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
             ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
             : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
+
+    Tensor<ADataType> a_gs_ms_ks(
+        f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+    Tensor<B0DataType> b0_gs_ns_ks(
+        f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+    Tensor<B1DataType> b1_gs_os_ns(
+        f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
index 2cb69380e5..86754927ed 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 int run(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -108,11 +112,30 @@ int run(int argc, char* argv[])
                              head_dim,
                              1}; // C layout [batch_size, head_num, sequence_length, head_dim]
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
+                                       std::vector<ck::index_t> strides,
+                                       bool permute,
+                                       auto layout) {
+        if(permute)
+        {
+            return HostTensorDescriptor(lens, strides, Bypass{});
+        }
+        else
+        {
+            return HostTensorDescriptor(lens, strides, layout);
+        }
+    };
+
+    Tensor<ADataType> a_gs_ms_ks(
+        f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
+    Tensor<B0DataType> b0_gs_ns_ks(
+        f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
+    Tensor<B1DataType> b1_gs_os_ns(
+        f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -186,7 +209,7 @@ int run(int argc, char* argv[])
         head_num * 3 * head_dim,
         head_dim,
         1}; // qkv layout [batch_size, sequence_length, head_num, 3, head_dim]
-    Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides);
+    Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides, Bypass{});
     // merge qkv into a packed pointer send to device
     a_gs_ms_ks.ForEach(
         [&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });
diff --git a/example/35_splitK_gemm/CMakeLists.txt b/example/35_splitK_gemm/CMakeLists.txt
index 904006ba36..e0476bfaad 100644
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
@@ -27,3 +27,16 @@ add_example_executable(example_gemm_xdl_splitk_reduce_multi_d_bf16 gemm_xdl_spli
 add_example_executable(example_gemm_xdl_splitk_reduce_bf16A_i8B gemm_xdl_splitk_reduce_bf16A_i8B.cpp)
 
 add_example_executable(example_gemm_xdl_splitk_reduce_bfp16 gemm_xdl_splitk_reduce_bf16.cpp)
+
+add_custom_target(example_splitK_gemm_wmma)
+add_example_executable(example_gemm_wmma_splitk_reduce_bf16 gemm_wmma_splitk_reduce_bf16.cpp)
+add_example_dependencies(example_splitK_gemm_wmma example_gemm_wmma_splitk_reduce_bf16)
+
+add_example_executable(example_gemm_wmma_splitk_reduce_bf16A_i8B gemm_wmma_splitk_reduce_bf16A_i8B.cpp)
+add_example_dependencies(example_splitK_gemm_wmma example_gemm_wmma_splitk_reduce_bf16A_i8B)
+
+add_example_executable(example_gemm_wmma_splitk_reduce_multi_d_bf16 gemm_wmma_splitk_reduce_multi_d_bf16.cpp)
+add_example_dependencies(example_splitK_gemm_wmma example_gemm_wmma_splitk_reduce_multi_d_bf16)
+
+add_example_executable(example_gemm_wmma_splitk_reduce_multi_d_fp16 gemm_wmma_splitk_reduce_multi_d_fp16.cpp)
+add_example_dependencies(example_splitK_gemm_wmma example_gemm_wmma_splitk_reduce_multi_d_fp16)
diff --git a/example/35_splitK_gemm/common.hpp b/example/35_splitK_gemm/common.hpp
index 64fadae9e5..325cc37731 100644
--- a/example/35_splitK_gemm/common.hpp
+++ b/example/35_splitK_gemm/common.hpp
@@ -99,3 +99,85 @@ bool parse_cmd_args(int argc,
 
     return true;
 }
+
+template <typename DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 1e-1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 1.5e-1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
+
+template <typename DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 16.1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 8192.1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
diff --git a/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16.cpp b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16.cpp
new file mode 100644
index 0000000000..b481483d42
--- /dev/null
+++ b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::bhalf_t;
+using CDataType        = ck::bhalf_t;
+using ReduceDataType   = ck::bhalf_t;
+using D0DataType       = ck::bhalf_t;
+using DsDataType       = ck::Tuple<>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using CLayout  = Row;
+using D0Layout = CLayout;
+using DsLayout = ck::Tuple<>;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceWmmaGemmInstance = 
+    ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3R1<
+        ALayout,   BLayout,   DsLayout,  CLayout,
+        ADataType,   BDataType, DsDataType,  CDataType, AccDataType,  CShuffleDataType,
+        AElementOp, BElementOp, CDEElementOp, GemmDefault,
+        256,
+        128, 128, 32,
+        8, 8,
+        16, 16,
+        4, 2,
+        S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,
+        1, 1, 8, true,
+        S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,
+        1, 1, 8, true,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v1, ReduceDataType>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_wmma_splitk_reduce_example.inc"
+
+int main(int argc, char* argv[]) { return !run_wmma_gemm_splitk_example(argc, argv); }
diff --git a/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16A_i8B.cpp b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16A_i8B.cpp
new file mode 100644
index 0000000000..dcf4a1652d
--- /dev/null
+++ b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_bf16A_i8B.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = int8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::bhalf_t;
+using CDataType        = ck::bhalf_t;
+using ReduceDataType   = float;
+using D0DataType       = ck::bhalf_t;
+using DsDataType       = ck::Tuple<>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using CLayout  = Row;
+using D0Layout = Row;
+using DsLayout = ck::Tuple<>;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceWmmaGemmInstance = 
+    ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3R1<
+        ALayout,   BLayout,   DsLayout,  CLayout,
+        ADataType,   BDataType, DsDataType,  CDataType, AccDataType,  CShuffleDataType,
+        AElementOp, BElementOp, CDEElementOp, GemmDefault,
+        256,   
+        128,  128,  32,
+        8,    8,
+        16,   16,
+        4,    2,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        1,    1,    S<1, 32, 1, 8>,  8,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v1, ReduceDataType>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_wmma_splitk_reduce_example.inc"
+
+int main(int argc, char* argv[]) { return !run_wmma_gemm_splitk_example(argc, argv); }
diff --git a/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_bf16.cpp b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_bf16.cpp
new file mode 100644
index 0000000000..dab308d148
--- /dev/null
+++ b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_bf16.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::bhalf_t;
+using CDataType        = ck::bhalf_t;
+using ReduceDataType   = float;
+using D0DataType       = ck::bhalf_t;
+using DsDataType       = ck::Tuple<D0DataType>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using CLayout  = Row;
+using D0Layout = CLayout;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3R1<
+        ALayout,   BLayout,   DsLayout,  CLayout,
+        ADataType,   BDataType, DsDataType,  CDataType, AccDataType,  CShuffleDataType,
+        AElementOp, BElementOp, CDEElementOp, GemmDefault,
+        256,
+        128,  128,  32,
+        8,    8,
+        16,   16,
+        4,    2,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        1,    1,    S<1, 32, 1, 8>,  8,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v1, ReduceDataType>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_wmma_splitk_reduce_multi_d_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_multi_d_example(argc, argv); }
diff --git a/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_fp16.cpp b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_fp16.cpp
new file mode 100644
index 0000000000..489816559d
--- /dev/null
+++ b/example/35_splitK_gemm/gemm_wmma_splitk_reduce_multi_d_fp16.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+using ADataType        = ck::half_t;
+using BDataType        = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+using ReduceDataType   = float;
+using D0DataType       = ck::half_t;
+using DsDataType       = ck::Tuple<D0DataType>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using CLayout  = Row;
+using D0Layout = CLayout;
+using DsLayout = ck::Tuple<D0Layout>;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3R1<
+        ALayout,   BLayout,   DsLayout,  CLayout,
+        ADataType,   BDataType, DsDataType,  CDataType, AccDataType,  CShuffleDataType,
+        AElementOp, BElementOp, CDEElementOp, GemmDefault,
+        256,
+        128,  256,  64,
+        8,    8,
+        16,   16,
+        4,    4,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,
+        1,    1,    8,   true,
+        1,    1,    S<1, 32, 1, 8>,  8,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v1, ReduceDataType>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_wmma_splitk_reduce_multi_d_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_multi_d_example(argc, argv); }
diff --git a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16.cpp b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16.cpp
index 7ceb1d09ef..1843198933 100644
--- a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16.cpp
+++ b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
         256,
         128,  128,  64,
         8,    4,
-        32,   32,
-        2,    2,
+        16,   16,
+        4,    4,
         S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,
         2,    8,    8,   0,
         S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,
         1,    8,    4,   0,
-        1,    1,    S<1, 32, 1, 8>,  8,
+        1,    1,    S<1, 32, 1, 8>,  4,
         ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
 // clang-format on
 
diff --git a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16A_i8B.cpp b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16A_i8B.cpp
index b5aeff65d6..1e4398b9f6 100644
--- a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16A_i8B.cpp
+++ b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_bf16A_i8B.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
         256,   
         128,  128,  64,
         8,    4,
-        32,   32,
-        2,    2,
+        16,   16,
+        4,    4,
         S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,
         2,    8,    8,   0,
         S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,
         1,    8,    4,   0,
-        1,    1,    S<1, 32, 1, 8>,  8,
+        1,    1,    S<1, 32, 1, 8>,  4,
         ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ReduceDataType>;
 // clang-format on
 
diff --git a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_bf16.cpp b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_bf16.cpp
index cb84f2a416..d5acde139a 100644
--- a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_bf16.cpp
+++ b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
         256,
         128,  128,  64,
         8,    4,
-        32,   32,
-        2,    2,
+        16,   16,
+        4,    4,
         S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,
         2,    8,    8,   0,
         S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,
         1,    8,    4,   0,
-        1,    1,    S<1, 32, 1, 8>,  8,
+        1,    1,    S<1, 32, 1, 8>,  4,
         ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ReduceDataType>;
 // clang-format on
 
diff --git a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_fp16.cpp b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_fp16.cpp
index 2ab8f77dc4..bb3c23f060 100644
--- a/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_fp16.cpp
+++ b/example/35_splitK_gemm/gemm_xdl_splitk_reduce_multi_d_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
         256,
         128,  128,  64,
         8,    4,
-        32,   32,
-        2,    2,
+        16,   16,
+        4,    4,
         S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,
         2,    8,    8,   0,
         S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,
         1,    8,    4,   0,
-        1,    1,    S<1, 32, 1, 8>,  8,
+        1,    1,    S<1, 32, 1, 8>,  4,
         ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v2, ReduceDataType>;
 // clang-format on
 
diff --git a/example/35_splitK_gemm/run_gemm_splitk_reduce_multi_d_example.inc b/example/35_splitK_gemm/run_gemm_splitk_reduce_multi_d_example.inc
index 9635993d63..0b060841bf 100644
--- a/example/35_splitK_gemm/run_gemm_splitk_reduce_multi_d_example.inc
+++ b/example/35_splitK_gemm/run_gemm_splitk_reduce_multi_d_example.inc
@@ -3,88 +3,6 @@
 
 #pragma once
 
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 1e-1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 16.1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 8192.1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
diff --git a/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_example.inc b/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_example.inc
new file mode 100644
index 0000000000..25628ef770
--- /dev/null
+++ b/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_example.inc
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ProblemType>
+bool run_wmma_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(stride == 0)
+            {
+                // give a chance if stride is zero, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return col;
+                }
+                else
+                {
+                    return row;
+                }
+            }
+            else
+                return stride;
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "init method: " << config.init_method << std::endl;
+    std::cout << "KBatch: " << KBatch << std::endl;
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // device GEMM
+    auto device_op = DeviceWmmaGemmInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto argument =
+        device_op.MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      std::array<const void*, 0>{}, // empty D tensors
+                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      std::array<ck::index_t, 0>{}, // empty D strides
+                                      StrideC,
+                                      KBatch,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+
+    // Allocate workspace for split-K reduction if needed
+    size_t workspace_size = device_op.GetWorkSpaceSize(argument.get());
+    DeviceMem workspace_buf(workspace_size);
+    std::cout << "Workspace size: " << workspace_size << " bytes" << std::endl;
+    if(workspace_size > 0)
+    {
+        argument->p_workspace_ = workspace_buf.GetDeviceBuffer();
+        std::cout << "Allocated workspace of size: " << workspace_size << " bytes" << std::endl;
+    }
+
+    if(!device_op.IsSupportedArgument(argument.get()))
+    {
+        std::cout << "The runtime argument is not supported!" << std::endl;
+        std::cout << "Debug info:" << std::endl;
+        std::cout << "  M=" << M << ", N=" << N << ", K=" << K << ", KBatch=" << KBatch
+                  << std::endl;
+        std::cout << "  StrideA=" << StrideA << ", StrideB=" << StrideB << ", StrideC=" << StrideC
+                  << std::endl;
+        return false;
+    }
+
+    bool pass      = true;
+    float ave_time = 0;
+
+    if(config.do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, false});
+
+        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass = ck::utils::check_err(c_m_n_device_result.mData,
+                                    c_m_n_host_result.mData,
+                                    "Error: Incorrect results!",
+                                    get_rtol<CDataType>(),
+                                    get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, config.time_kernel});
+
+        std::size_t flop = std::size_t(2) * M * N * K;
+
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E12 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E9 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << device_op.GetTypeString() << std::endl;
+    }
+
+    return pass;
+}
+
+bool run_wmma_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_wmma_gemm(problem_size, config);
+}
diff --git a/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_multi_d_example.inc b/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_multi_d_example.inc
new file mode 100644
index 0000000000..59996655c6
--- /dev/null
+++ b/example/35_splitK_gemm/run_gemm_wmma_splitk_reduce_multi_d_example.inc
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ProblemSize>
+bool run_wmma_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M        = problem_size.M;
+    auto N        = problem_size.N;
+    auto K        = problem_size.K;
+    auto StrideA  = problem_size.StrideA;
+    auto StrideB  = problem_size.StrideB;
+    auto StrideC  = problem_size.StrideC;
+    auto StrideD0 = problem_size.StrideC;
+    auto KBatch   = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(stride == 0)
+            {
+                // give a chance if stride is zero, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return col;
+                }
+                else
+                {
+                    return row;
+                }
+            }
+            else
+                return stride;
+        };
+
+    StrideA  = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB  = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC  = f_get_default_stride(M, N, StrideC, CLayout{});
+    StrideD0 = f_get_default_stride(M, N, StrideD0, D0Layout{});
+
+    Tensor<ADataType> a_m_k(
+        f_host_tensor_descriptor(problem_size.M, problem_size.K, problem_size.StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(
+        f_host_tensor_descriptor(problem_size.K, problem_size.N, problem_size.StrideB, BLayout{}));
+    Tensor<D0DataType> d0_m_n(
+        f_host_tensor_descriptor(problem_size.M, problem_size.N, problem_size.StrideC, D0Layout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(
+        f_host_tensor_descriptor(problem_size.M, problem_size.N, problem_size.StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(
+        f_host_tensor_descriptor(problem_size.M, problem_size.N, problem_size.StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "init method: " << config.init_method << std::endl;
+    std::cout << "KBatch: " << KBatch << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto gemm                                              = DeviceGemmV2Instance{};
+    auto invoker                                           = gemm.MakeInvoker();
+    constexpr auto kNum_DTensors                           = DsDataType::Size();
+    const std::array<const void*, kNum_DTensors> p_ds      = {d0_m_n_device_buf.GetDeviceBuffer()};
+    const std::array<ck::index_t, kNum_DTensors> d_strides = {problem_size.StrideC};
+
+    auto argument =
+        gemm.MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                 static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                 p_ds,
+                                 static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                 problem_size.M,
+                                 problem_size.N,
+                                 problem_size.K,
+                                 problem_size.StrideA,
+                                 problem_size.StrideB,
+                                 d_strides,
+                                 problem_size.StrideC,
+                                 problem_size.KBatch,
+                                 a_element_op,
+                                 b_element_op,
+                                 c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument.get()))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+        return false;
+    }
+
+    auto workspace_size = gemm.GetWorkSpaceSize(argument.get());
+    DeviceMem workspace_device_buf(workspace_size);
+
+    std::cout << "Workspace size: " << workspace_size << " bytes" << std::endl;
+    std::cout << "Allocated workspace of size: " << workspace_size << " bytes" << std::endl;
+
+    if(workspace_size > 0)
+    {
+        argument->p_workspace_ = workspace_device_buf.GetDeviceBuffer();
+    }
+
+    if(config.do_verification)
+    {
+        using ReferenceGemmInstanceMultiD = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                      BDataType,
+                                                                                      CDataType,
+                                                                                      AccDataType,
+                                                                                      AElementOp,
+                                                                                      BElementOp,
+                                                                                      PassThrough>;
+
+        auto ref_gemm    = ReferenceGemmInstanceMultiD{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        c_m_n_host_result.ForEach(
+            [&](auto& self, auto idx) { c_element_op(self(idx), self(idx), d0_m_n(idx)); });
+    }
+
+    std::cout << "init method: " << config.init_method << std::endl;
+    std::cout << "KBatch: " << problem_size.KBatch << std::endl;
+
+    float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop      = std::size_t(2) * problem_size.M * problem_size.N * problem_size.K;
+    std::size_t num_btype = sizeof(ADataType) * problem_size.M * problem_size.K +
+                            sizeof(BDataType) * problem_size.K * problem_size.N +
+                            sizeof(CDataType) * problem_size.M * problem_size.N +
+                            sizeof(D0DataType) * problem_size.M * problem_size.N;
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    if(config.do_verification)
+    {
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        double rtol = get_rtol<CDataType>();
+        double atol = get_atol<CDataType>();
+
+        return ck::utils::check_err(
+            c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results!", rtol, atol);
+    }
+
+    return true;
+}
+
+int run_gemm_splitk_multi_d_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_wmma_gemm(problem_size, config);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
index fdf49a31b7..1b8194f838 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -51,9 +51,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 //######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise| Spacialization|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|         Type|
 //######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|             |
 //######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |             | 
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               4, ComputeType>;
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,   8,   16,   16,    8,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               4, ComputeType>;
 // clang-format on
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
index dc54bc30ef..8628e8770c 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -50,9 +50,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 //######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise| Spacialization|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
 //######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>;
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,   8,   16,   16,    8,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               4>;
 // clang-format on
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp
index b93639e6c1..8091a5b448 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -57,4 +57,12 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
index 7506f69420..4257451754 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -55,4 +55,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
index 7ebf914408..f0d4e28ad2 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -89,4 +89,12 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 #define BUILD_INT4_EXAMPLE
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
index 6b0c1aa02d..d800443932 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -48,9 +48,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
 //######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise| Spacialization|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|        Type|
 //######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|            |
 //######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |            |
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,  16,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,             16,             16,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,             16,             16,      true,           1,           1,                   S<1, 32, 1, 8>,               4, ComputeType>;
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,  16,   16,   16,    8,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,             16,             16,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,             16,             16,      true,           1,           1,                   S<1, 32, 1, 8>,               4, ComputeType>;
 // clang-format on
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp
index fc55019fc4..ef27c7bb9f 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp
@@ -79,4 +79,11 @@ using DeviceGemmInstance          = ck::tensor_operation::device::DeviceGemmXdlS
 
 #include "run_splitK_gemm_example.inc"
 
-int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_splitK_gemm_example(argc, argv);
+}
diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
index 26a03f289d..a1b952259f 100644
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -50,14 +50,14 @@ template<> struct emb_kernel<ck::half_t, 8192> { using kernel_type = DeviceInsta
 
 // clang-format on
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = true;
 
-    constexpr auto num_rows = 65536;
-    constexpr auto dims     = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
-    // constexpr auto dims = ck::Sequence<256, 512>{};
-    constexpr auto index_length   = 2048;
+    ck::index_t num_rows          = 65536;
+    constexpr auto dims           = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
+    ck::index_t index_length      = 2048;
+    ck::index_t dim_mask          = 0xffff;
     constexpr AccDataType epsilon = 1e-4;
 
     auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); };
@@ -73,121 +73,140 @@ int main()
                                                                               BetaDataType,
                                                                               AccDataType,
                                                                               OutType>;
-
+    if(argc == 1)
+    {
+        // Use default value
+    }
+    else if(argc == 4)
+    {
+        num_rows     = atoi(argv[1]);
+        dim_mask     = strtol(argv[2], nullptr, 0);
+        index_length = atoi(argv[3]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1-3: num_rows dim_mask index_length" << std::endl;
+    }
     ck::static_for<0, dims.Size(), 1>{}([&](auto I) {
-        std::srand(std::time(nullptr));
-        constexpr auto current_dim = dims.At(I);
-        Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
-
-        Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
-
-        Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
-        Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
-
-        Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
-
-        emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-
-        index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-
-        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
-        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
-
-        DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
-        DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
-        DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
-        DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
-        DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-        DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-
-        DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
-
-        emb_a_dev.ToDevice(emb_a.mData.data());
-        emb_b_dev.ToDevice(emb_b.mData.data());
-        emb_c_dev.ToDevice(emb_c.mData.data());
-
-        index_a_dev.ToDevice(index_a.mData.data());
-        index_b_dev.ToDevice(index_b.mData.data());
-        index_c_dev.ToDevice(index_c.mData.data());
-
-        gamma_dev.ToDevice(gamma.mData.data());
-        beta_dev.ToDevice(beta.mData.data());
-
-        auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
-        auto argument_ptr    = device_instance.MakeArgumentPointer(
-            out_dev.GetDeviceBuffer(),
-            {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
-                ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
-                ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
-            {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
-                ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
-                ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
-            gamma_dev.GetDeviceBuffer(),
-            beta_dev.GetDeviceBuffer(),
-            current_dim,
-            index_length,
-            epsilon,
-            EmbElementwiseOperation{});
-        std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
-                  << std::endl
-                  << std::flush;
-
-        bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
-
-        if(!is_supported)
+        if(dim_mask & (1 << I.value))
         {
-            std::cout << "Runtime parameters are not supported" << std::endl;
-            return;
+            std::srand(std::time(nullptr));
+            constexpr auto current_dim = dims.At(I);
+            Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
+
+            Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
+
+            Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
+            Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
+
+            Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
+
+            emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+
+            index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+
+            gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+            beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
+
+            DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
+            DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
+            DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
+            DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
+            DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+            DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+
+            DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
+
+            emb_a_dev.ToDevice(emb_a.mData.data());
+            emb_b_dev.ToDevice(emb_b.mData.data());
+            emb_c_dev.ToDevice(emb_c.mData.data());
+
+            index_a_dev.ToDevice(index_a.mData.data());
+            index_b_dev.ToDevice(index_b.mData.data());
+            index_c_dev.ToDevice(index_c.mData.data());
+
+            gamma_dev.ToDevice(gamma.mData.data());
+            beta_dev.ToDevice(beta.mData.data());
+
+            auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
+            auto argument_ptr    = device_instance.MakeArgumentPointer(
+                out_dev.GetDeviceBuffer(),
+                {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
+                    ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
+                    ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
+                {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
+                    ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
+                    ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
+                gamma_dev.GetDeviceBuffer(),
+                beta_dev.GetDeviceBuffer(),
+                current_dim,
+                index_length,
+                epsilon,
+                EmbElementwiseOperation{});
+            std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
+                      << std::endl
+                      << std::flush;
+
+            bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
+
+            if(!is_supported)
+            {
+                std::cout << "Runtime parameters are not supported" << std::endl;
+                return;
+            }
+
+            auto invoker_ptr = device_instance.MakeInvokerPointer();
+            float time_ms =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            bool pass = true;
+            {
+                Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
+                ReferenceInstance ref;
+                auto ref_argument = ref.MakeArgument(out,
+                                                     emb_a,
+                                                     emb_b,
+                                                     emb_c,
+                                                     index_a,
+                                                     index_b,
+                                                     index_c,
+                                                     gamma,
+                                                     beta,
+                                                     num_rows,
+                                                     current_dim,
+                                                     index_length,
+                                                     epsilon);
+                auto ref_invoker  = ref.MakeInvoker();
+                ref_invoker.Run(ref_argument);
+
+                out_dev.FromDevice(out_from_dev.mData.data());
+                pass &=
+                    ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
+            }
+
+            double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
+                                current_dim * sizeof(GammaDataType) +
+                                current_dim * sizeof(BetaDataType);
+            double total_write = current_dim * index_length * sizeof(OutType);
+            double gbps        = (total_read + total_write) / time_ms / 1e6;
+
+            std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
+                      << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
+                      << std::flush;
         }
-
-        auto invoker_ptr = device_instance.MakeInvokerPointer();
-        float time_ms    = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-        bool pass = true;
-        {
-            Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
-            ReferenceInstance ref;
-            auto ref_argument = ref.MakeArgument(out,
-                                                 emb_a,
-                                                 emb_b,
-                                                 emb_c,
-                                                 index_a,
-                                                 index_b,
-                                                 index_c,
-                                                 gamma,
-                                                 beta,
-                                                 num_rows,
-                                                 current_dim,
-                                                 index_length,
-                                                 epsilon);
-            auto ref_invoker  = ref.MakeInvoker();
-            ref_invoker.Run(ref_argument);
-
-            out_dev.FromDevice(out_from_dev.mData.data());
-            pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
-        }
-
-        double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
-                            current_dim * sizeof(GammaDataType) +
-                            current_dim * sizeof(BetaDataType);
-        double total_write = current_dim * index_length * sizeof(OutType);
-        double gbps        = (total_read + total_write) / time_ms / 1e6;
-
-        std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
-                  << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
-                  << std::flush;
     });
 
     return 0;
diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index f27dc60541..4934f74393 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
@@ -154,11 +154,11 @@ using DeviceGemmInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        4,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        8,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -185,7 +185,7 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 int main(int argc, char* argv[])
 {
@@ -321,11 +321,13 @@ int main(int argc, char* argv[])
 
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp
index 4c28e25e01..a377685e52 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
 #include "common.hpp"
@@ -26,7 +26,7 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat
 // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|                    Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
 // ######|            |          |          |           |         |            |            |            |                 |                        |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
 // ######|            |          |          |           |         |            |            |            |                 |                        |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-         < NDimSpatial, OutLayout, WeiLayout, BiasLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType, ck::Tuple<BiasDataType>, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+         < NDimSpatial, OutLayout, WeiLayout, BiasLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType, ck::Tuple<BiasDataType>, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>;
 // clang-format on
 
 #include "run_grouped_conv_bwd_data_bias_relu_example.inc"
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp
index b1554412b1..59d94c34bb 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
 #include "common.hpp"
@@ -26,7 +26,7 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat
 // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
 // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
 // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-         < NDimSpatial, OutLayout, WeiLayout,   DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType,   DsDataType, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+         < NDimSpatial, OutLayout, WeiLayout,   DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType,   DsDataType, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>;
 // clang-format on
 
 #include "run_grouped_conv_bwd_data_example.inc"
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16_comp_bf8_fp8.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16_comp_bf8_fp8.cpp
index 41023ef82a..d49fb9befb 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16_comp_bf8_fp8.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16_comp_bf8_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
 #include "common.hpp"
@@ -30,9 +30,17 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat
 // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector| Scheduler|         Type|         Type|
 // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|          |             |             |
 // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |          |             |             |
-         < NDimSpatial, OutLayout, WeiLayout,   DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType,   DsDataType, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8, LoopSched, AComputeType, BComputeType>;
+         < NDimSpatial, OutLayout, WeiLayout,   DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType,   DsDataType, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4, LoopSched, AComputeType, BComputeType>;
 // clang-format on
 
 #include "run_grouped_conv_bwd_data_example.inc"
 
-int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_grouped_conv_bwd_data_example(argc, argv);
+}
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
index 0f0b120cbc..80d56cd781 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
@@ -206,7 +206,8 @@ int run_grouped_conv_bwd_data_bias_relu_example(int argc, char* argv[])
                                                               1,              // c
                                                               0,              // hi
                                                               0               // wi
-                                                          });
+                                                          },
+                                                          ctc::GNCHW{});
 
     // input image: GNHWC
     const auto in_g_n_c_wis_desc =
diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp
index 7336c3b631..30cf4ef083 100644
--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_element_example.inc"
 
-int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp
index 6c24919ded..c655384301 100644
--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
@@ -19,4 +19,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_bundle_example.inc"
 
-int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp
index 3551d2a7c8..d3d7f47ced 100644
--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_element_example.inc"
 
-int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_element_example({121, 768, 80}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc
index 2c19872922..fab02f8cf3 100644
--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
@@ -3,7 +3,7 @@
 
 #pragma once
 
-bool run_permute_bundle(const Problem& problem)
+bool run_permute_bundle(const Problem& problem, bool time_kernel)
 {
     const auto& input_bundle_shape = problem.shape;
     const auto& input_bundle_axes  = problem.axes;
@@ -41,7 +41,7 @@ bool run_permute_bundle(const Problem& problem)
     };
 
     auto invoker   = permute.MakeInvoker();
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::cout << "Perf: " << ave_time << " ms" << std::endl;
 
@@ -72,7 +72,9 @@ bool run_permute_bundle(const Problem& problem)
                                 1e-6);
 }
 
-bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
+bool run_permute_bundle_example(const Problem::Shape& shape,
+                                const Problem::Axes& axes,
+                                bool time_kernel)
 {
-    return run_permute_bundle(Problem{shape, axes});
+    return run_permute_bundle(Problem{shape, axes}, time_kernel);
 }
diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc
index 3587134456..c3f3b972e9 100644
--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
@@ -3,7 +3,7 @@
 
 #pragma once
 
-bool run_permute_element(const Problem& problem)
+bool run_permute_element(const Problem& problem, bool time_kernel)
 {
     const auto& input_shape = problem.shape;
     const auto& input_axes  = problem.axes;
@@ -40,7 +40,7 @@ bool run_permute_element(const Problem& problem)
     };
 
     auto invoker   = permute.MakeInvoker();
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::cout << "Perf: " << ave_time << " ms" << std::endl;
 
@@ -59,7 +59,9 @@ bool run_permute_element(const Problem& problem)
                                 1e-6);
 }
 
-bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
+bool run_permute_element_example(const Problem::Shape& shape,
+                                 const Problem::Axes& axes,
+                                 bool time_kernel)
 {
-    return run_permute_element(Problem{shape, axes});
+    return run_permute_element(Problem{shape, axes}, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
index 4573c68658..f9a7d9f638 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
@@ -78,8 +78,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
index 005f6263fd..333987edd6 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
@@ -76,9 +76,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
index 62e5e583de..4b94045421 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
@@ -79,9 +79,29 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float scale_z_inv         = 0.5f;
     const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
index ef98fe7e4f..b74e06b10a 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
@@ -76,10 +76,29 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float scale_acc           = 0.5f;
     float scale_z_inv         = 0.5f;
     const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
index e524ddb2b2..c3ac40a1bc 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -76,8 +76,27 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
index d29a3143c0..437fd6f4c2 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -71,9 +71,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
index 8c0049b0fa..d9cfae2898 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
@@ -57,10 +57,10 @@ using DeviceGroupedConvNDFwdInstance =
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -77,13 +77,33 @@ using DeviceGroupedConvNDFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 64, 1, 4>,
-        8>;
+        S<1, 32, 1, 8>,
+        4>;
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
index e18c123f7c..9d3024fce7 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
@@ -55,10 +55,10 @@ using DeviceGroupedConvNDFwdInstance =
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -75,14 +75,33 @@ using DeviceGroupedConvNDFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 64, 1, 4>,
-        8>;
+        S<1, 32, 1, 8>,
+        4>;
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
index 53f810cc9e..2d4ae1f837 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
@@ -55,10 +55,10 @@ using DeviceGroupedConvNDFwdInstance =
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -75,13 +75,32 @@ using DeviceGroupedConvNDFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 64, 1, 4>,
-        8>;
+        S<1, 32, 1, 8>,
+        4>;
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
index 9db6e201dd..79b0c00fa5 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
@@ -50,10 +50,10 @@ using DeviceGroupedConvNDFwdInstance =
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -70,14 +70,33 @@ using DeviceGroupedConvNDFwdInstance =
         1,           // BBlockLdsExtraN
         1,
         1,
-        S<1, 64, 1, 4>,
-        16>;
+        S<1, 32, 1, 8>,
+        4>;
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index e5b924ad51..3c089688cf 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -167,10 +167,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op,
+                                                        bool do_verification,
+                                                        bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
@@ -214,7 +214,8 @@ int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_
                                                         1,             // k
                                                         0,             // ho
                                                         0              // wo
-                                                    });
+                                                    },
+                                                    BiasLayout{});
 
     const auto requant_scale_g_k_desc = bias_g_k_desc;
 
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 9f3a769dcf..ed7886e76b 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -155,10 +155,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op,
+                                                      bool do_verification,
+                                                      bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
@@ -201,7 +201,8 @@ int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_el
                                                         1,             // k
                                                         0,             // ho
                                                         0              // wo
-                                                    });
+                                                    },
+                                                    BiasLayout{});
 
     const auto out_g_n_k_wos_desc =
         ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index 9b08fc690d..12fdf425bf 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -157,10 +157,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op,
+                                                   bool do_verification,
+                                                   bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
@@ -203,7 +203,8 @@ int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_eleme
                                  1,             // k
                                  0,             // ho
                                  0              // wo
-                             });
+                             },
+                             RequantScaleLayout{});
 
     const auto out_g_n_k_wos_desc =
         ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index 267c737e00..eae6e996cc 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -139,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op,
+                                                 bool do_verification,
+                                                 bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = false;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
index e37d413695..ba589ec044 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -73,11 +73,11 @@ using DeviceBatchedGemmGemmInstance =
         8,           // AK1
         8,           // BK1
         4,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        4,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        8,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -102,8 +102,16 @@ using DeviceBatchedGemmGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 #include "run_grouped_conv_conv_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // disable on gfx11 due to precsion issue.
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
index 496e676a40..847859068f 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -73,11 +73,11 @@ using DeviceBatchedGemmGemmInstance =
         8,           // AK1
         8,           // BK1
         4,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        4,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        8,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -102,7 +102,7 @@ using DeviceBatchedGemmGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
 
 #include "run_grouped_conv_conv_fwd_example.inc"
 
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
index 35d50721dc..9a104dbfab 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -106,4 +106,11 @@ using DeviceBatchedGemmGemmInstance =
 
 #include "run_grouped_conv_conv_fwd_example.inc"
 
-int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1;
+}
diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
index 0722d497d8..852a9bef88 100644
--- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
@@ -257,7 +257,7 @@ bool run_grouped_conv_conv_fwd(bool do_verification,
 #endif
 
         return ck::utils::check_err(
-            out1_device, out1_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+            out1_device, out1_host, "Error: incorrect results!", 1e-3f, 1.5e-3f);
     }
 
     return true;
diff --git a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
index ab6f317bc6..86e1c8ccc8 100644
--- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
+++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
@@ -11,21 +11,36 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
     ck::index_t G = 64;
     ck::index_t C = 128;
 
+    bool do_verification = true;
+    bool time_kernel     = true;
+    bool log_kernel      = true;
+
     if(argc == 1)
     {
         // use default case
     }
-    else if(argc == 6)
+    else if(argc == 4)
     {
-        N = std::stoi(argv[1]);
-        H = std::stoi(argv[2]);
-        W = std::stoi(argv[3]);
-        G = std::stoi(argv[4]);
-        C = std::stoi(argv[5]);
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        log_kernel      = std::stoi(argv[3]);
+    }
+    else if(argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        log_kernel      = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        H               = std::stoi(argv[5]);
+        W               = std::stoi(argv[6]);
+        G               = std::stoi(argv[7]);
+        C               = std::stoi(argv[8]);
     }
     else
     {
-        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+        std::cerr << "arg1 = verify(0=no, 1=yes), arg2 = time kernels(0=no, 1=yes), arg3 = log "
+                     "kernels(0=no, 1=yes), arg4 to 8: N, H, W, G, C"
+                  << std::endl;
 
         return 1;
     }
@@ -94,7 +109,8 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
     device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
 
     auto invoker_ptr = device_instance.MakeInvokerPointer();
-    float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_kernel});
 
     std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
                             sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
@@ -106,6 +122,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
               << device_instance.GetTypeString() << std::endl;
 
     bool pass = true;
+    if(do_verification)
     {
         Tensor<YDataType> host_y({N, H, W, G, C});
         Tensor<SaveMeanInvStdDataType> host_save_mean(HostTensorDescriptor{N, G});
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
index ebba88cf41..b5e9686260 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Add         = ck::tensor_operation::element_wise::Add;
 
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
 
     Tensor<ADataType> a_gs_ms_ks(
         std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
+        Row{});
     Tensor<BDataType> b_gs_ns_ks(
         std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
+        Row{});
     Tensor<DDataType> d_gs_ms_ns(
         std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
+        Bypass{});
     Tensor<EDataType> e_gs_ms_ns_host_result(
         std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+        Bypass{});
     Tensor<EDataType> e_gs_ms_ns_device_result(
         std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+        Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(
             std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+            Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                      NumDimM,
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
index 4ab26293cc..32a7e4a76e 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Add         = ck::tensor_operation::element_wise::Add;
 
@@ -53,7 +56,7 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
         //############################################|        |        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|        _MBlock_MWaveMPerXdl|  ScalarPerVector|
         //############################################|        |        |        |        |          |          |            |                 |           |          |    Operation|   Operation|    Operation|               |               |               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|        _NBlock_NWaveNPerXdl|    _NWaveNPerXdl|
         //############################################|        |        |        |        |          |          |            |                 |           |          |             |            |             |               |               |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                            |                 |
-        DeviceSplitKContractionMultipleD_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   4,   4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,         1,           1,           1,              S<1, 32, 1, 4>,               4>;
+        DeviceSplitKContractionMultipleD_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   4,   4,   16,   16,    8,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,         1,           1,           1,              S<1, 32, 1, 4>,               2>;
 // clang-format on
 
 using DeviceOpInstance = DeviceOpInstanceKKNN;
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
 
     Tensor<ADataType> a_gs_ms_ks(
         std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
+        Row{});
     Tensor<BDataType> b_gs_ns_ks(
         std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
+        Row{});
     Tensor<DDataType> d_gs_ms_ns(
         std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
+        Bypass{});
     Tensor<EDataType> e_gs_ms_ns_host_result(
         std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+        Bypass{});
     Tensor<EDataType> e_gs_ms_ns_device_result(
         std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+        Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(
             std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
+            Bypass{});
 
         using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                      NumDimM,
diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
index 8819bb65e6..8ddd432c11 100644
--- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
@@ -22,6 +22,8 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;
 
+using NchwLayout  = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout  = ck::tensor_layout::convolution::NHWC;
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -44,12 +46,46 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8, 8>,              // InScalarPerVectorSeq
     ck::Sequence<8>>;                // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 5)
+    {
+        nchw[0] = std::stoi(argv[1]);
+        nchw[1] = std::stoi(argv[2]);
+        nchw[2] = std::stoi(argv[3]);
+        nchw[3] = std::stoi(argv[4]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 4: N, C, H, W" << std::endl;
+
+        return 1;
+    }
+
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
                                              static_cast<int>(nchw[2] * nchw[3]),
@@ -57,11 +93,11 @@ int main()
                                              1};
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides),
-                                           Tensor<ADataType>(ab_lengths, ab_strides)};
+    std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
+                                           Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
     Tensor<ADataType>& a0               = as[0];
     Tensor<ADataType>& a1               = as[1];
-    Tensor<BDataType> b(ab_lengths, ab_strides);
+    Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
     float alpha = 3.f;
     float beta  = 2.f;
     a0.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -118,7 +154,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, ab_strides);
+        Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
 
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<2, ADataType, BDataType, BinaryAddUnaryScaleSquare>;
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
index 3ea1aa4bf8..2d689648f2 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -22,6 +22,8 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;
 
+using NchwLayout                       = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout                       = ck::tensor_layout::convolution::NHWC;
 using PassThrough                      = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
     ck::Tuple<ADataType>, // InDataTypeTuple
@@ -37,11 +39,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
@@ -56,9 +74,9 @@ int main()
                                             static_cast<int>(nhwc[3])};
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
     Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(ab_lengths, b_strides);
+    Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
 
     a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
 
@@ -101,7 +119,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
         using ReferenceElementwiseInstance =
             ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
index 13c67fce05..6e70a306d3 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
@@ -23,6 +23,8 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;
 
+using NchwLayout  = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout  = ck::tensor_layout::convolution::NHWC;
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -41,11 +43,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 8, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 8};
     std::array<ck::index_t, 4> ab_lengths;
@@ -60,9 +78,9 @@ int main()
                                             static_cast<int>(nhwc[0] * nhwc[1])};
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
     Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(ab_lengths, b_strides);
+    Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
     float scale = 1.f;
     auto i      = 0;
     std::mt19937 gen(11939);
@@ -121,7 +139,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
index 0a0f6fec10..632d88e88a 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
@@ -22,6 +22,9 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;
 
+using NchwLayout = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout = ck::tensor_layout::convolution::NHWC;
+
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -40,11 +43,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
@@ -60,9 +79,9 @@ int main()
 
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
     Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(ab_lengths, b_strides);
+    Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
 
     float scale = 2.f;
     a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -112,7 +131,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
index fc664186be..bd54f1c19c 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
@@ -22,6 +22,8 @@ using F32 = float;
 using ADataType = F32;
 using BDataType = F32;
 
+using NchwLayout  = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout  = ck::tensor_layout::convolution::NHWC;
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -40,11 +42,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<1>,      // InScalarPerVectorSeq
     ck::Sequence<1>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 8, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 8};
     std::array<ck::index_t, 4> ab_lengths;
@@ -60,9 +78,9 @@ int main()
                                             static_cast<int>(nhwc[0] * nhwc[1])};
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
     Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(ab_lengths, b_strides);
+    Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
 
     float scale = 1.f;
     auto i      = 0;
@@ -123,7 +141,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
index a0c416318a..9621d591a9 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
@@ -22,6 +22,9 @@ using F32 = float;
 using ADataType = F32;
 using BDataType = F32;
 
+using NchwLayout = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout = ck::tensor_layout::convolution::NHWC;
+
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -40,11 +43,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
@@ -60,9 +79,9 @@ int main()
 
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
     Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(ab_lengths, b_strides);
+    Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
     float scale = 2.f;
     a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
 
@@ -111,7 +130,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
index c40447e1f9..0619cc7139 100644
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
@@ -119,6 +119,22 @@ int main(int argc, char* argv[])
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const float scale = 2.f;
 
     ck::index_t M = 1024;
diff --git a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
index 050300eed2..be4014f636 100644
--- a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
@@ -22,6 +22,9 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;
 
+using NchwLayout = ck::tensor_layout::convolution::NCHW;
+using NhwcLayout = ck::tensor_layout::convolution::NHWC;
+
 using UnaryScale  = ck::tensor_operation::element_wise::Scale;
 using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
 using UnaryScaleSquare =
@@ -48,11 +51,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8, 8, 8>,                      // InScalarPerVectorSeq
     ck::Sequence<8>>;                           // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
@@ -62,13 +81,13 @@ int main()
 
     ck::ranges::copy(nchw, ab_lengths.begin());
 
-    std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides),
-                                           Tensor<ADataType>(ab_lengths, ab_strides),
-                                           Tensor<ADataType>(ab_lengths, ab_strides)};
+    std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
+                                           Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
+                                           Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
     Tensor<ADataType>& a0               = as[0];
     Tensor<ADataType>& a1               = as[1];
     Tensor<ADataType>& a2               = as[2];
-    Tensor<BDataType> b(ab_lengths, ab_strides);
+    Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
     float alpha = 3.f;
     float beta  = 2.f;
     float gamma = 4.f;
@@ -133,7 +152,7 @@ int main()
 
     if(do_verification)
     {
-        Tensor<BDataType> host_b(ab_lengths, ab_strides);
+        Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
         using ReferenceElementwiseInstance = ck::tensor_operation::host::
             ReferenceElementwise<3, ADataType, BDataType, TrinaryAddUnaryScaleSquare>;
         auto ref_elementwise = ReferenceElementwiseInstance{};
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index c02d540983..8064809123 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -77,12 +77,44 @@ void host_elementwise2D(HostTensorC& C,
         }
 }
 
-int main()
+int main(int argc, char* argv[])
 {
-    bool time_kernel = true;
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    ck::index_t M = 48 * 256;
+    ck::index_t N = 1024;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 2: M, N" << std::endl;
+        return 1;
+    }
 
-    ck::index_t M      = 48 * 256;
-    ck::index_t N      = 1024;
     ck::index_t Stride = N;
 
     auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
@@ -157,6 +189,7 @@ int main()
     std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
 
     bool pass = true;
+    if(do_verification)
     {
         std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
                                        static_cast<unsigned long>(N)};
diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
index 56417b101d..4d73f0c35f 100644
--- a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::
         //##############################| Layout| Layout|   Layout| Layout|  Type|  Type|    Type| DataType|       Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //##############################|       |       |         |       |      |      |        |         |           |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################|       |       |         |       |      |      |        |         |           |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, DsLayout,    Row,   F16,   F16,     F32,      F16, DsDataType,   F16, PassThrough, PassThrough, CDEElementOp,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8>;
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, DsLayout,    Row,   F16,   F16,     F32,      F16, DsDataType,   F16, PassThrough, PassThrough, CDEElementOp,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   16,   16,    8,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               4>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
index e1b2bccfe1..bba6ae14a4 100644
--- a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+++ b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
@@ -1,22 +1,30 @@
 #pragma once
+#include <type_traits>
 
 bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
     using namespace ck::literals;
 
-    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+    ProblemSize ps =
+        problem_size; // make mutable copy because default stride values of 0 need to be updated
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
 
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
@@ -123,7 +131,16 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
 
         e_device_buf.FromDevice(e_m_n_device_result.mData.data());
 
-        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+        if(std::is_same_v<ck::ranges::range_value_t<decltype(e_m_n_device_result)>, ck::half_t> &&
+           std::is_same_v<ck::ranges::range_value_t<decltype(e_m_n_host_result)>, ck::half_t>)
+        {
+            return ck::utils::check_err(
+                e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-3, 1e-3);
+        }
+        else
+        {
+            return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+        }
     }
 
     return true;
diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
index 392cb155cb..3e69caf51e 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
@@ -18,6 +18,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
@@ -91,11 +95,11 @@ using DeviceOpInstance =
         8,           // AK1
         8,           // BK1
         2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        2,           // Gemm1NXdlPerWave
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
         S<4, 64, 1>, // ABlockTransfer
         S<1, 0, 2>,
         S<1, 0, 2>,
@@ -120,7 +124,7 @@ using DeviceOpInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
         MaskingSpec,    // MaskingSpecialization
         1>;
 
@@ -159,6 +163,12 @@ int main(int argc, char* argv[])
     int O       = 64;
     float alpha = 1;
 
+    // temp disable on gfx11, d0_gs_ms_ns isn't handled correctly when it is not a constant.
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
     if(argc == 1)
     {
         // use default case
@@ -214,12 +224,12 @@ int main(int argc, char* argv[])
     std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
     std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Row{});
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Col{});
+    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides, Row{});
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
index 788f38ec52..ef64dd167d 100644
--- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
@@ -48,15 +48,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
 
     if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
     {
-        return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+        return HostTensorDescriptor(
+            {N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
     }
     else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
     {
-        return HostTensorDescriptor({N_, C_, D, H, W},
-                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        return HostTensorDescriptor(
+            {N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
     }
     throw std::runtime_error("Pool3d_fwd: problem with layout. ");
-    return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
+    return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
 };
 
 template <typename DevicePoolFwdInstance,
diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
index 2c1e669375..51b7f0015d 100644
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
@@ -77,7 +77,9 @@ bool maxpool_bwd_test(bool do_verification,
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
             using namespace ck::literals;
             // reference need Tensor with NCHW order
-            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+            return HostTensorDescriptor({N_, C_, H, W},
+                                        {C_ * H * W, 1_uz, W * C_, C_},
+                                        ck::tensor_layout::convolution::NCHW{});
         };
 
     // in
diff --git a/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp b/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
index 032828f7bc..940a21acc9 100644
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
@@ -42,15 +42,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
 
     if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
     {
-        return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+        return HostTensorDescriptor(
+            {N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
     }
     else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
     {
-        return HostTensorDescriptor({N_, C_, D, H, W},
-                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        return HostTensorDescriptor(
+            {N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
     }
     throw std::runtime_error("Avgpool3d_bwd: problem with layout. ");
-    return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
+    return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
 };
 
 template <typename DevicePoolBwdInstance,
diff --git a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
index 6cf1b2ff91..dcbb472118 100644
--- a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
+++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
@@ -100,7 +100,7 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
     4,     // DGammaDstVectorSize
     4>;    // DBetaDstVectorSize
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = false;
 
@@ -110,6 +110,25 @@ int main()
     ck::index_t G = 32;
     ck::index_t C = 64;
 
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+
+        return 1;
+    }
+
     Tensor<DYDataType> dy({N, H, W, G, C});
     Tensor<XDataType> x({N, H, W, G, C});
     Tensor<GammaDataType> gamma({G, C});
diff --git a/example/60_gemm_multi_ABD/CMakeLists.txt b/example/60_gemm_multi_ABD/CMakeLists.txt
index a9e0d3f9ad..ffc6cec61d 100644
--- a/example/60_gemm_multi_ABD/CMakeLists.txt
+++ b/example/60_gemm_multi_ABD/CMakeLists.txt
@@ -1,3 +1,7 @@
+add_example_executable(example_gemm_multi_ABD_wmma_fp16 gemm_multi_ABD_wmma_fp16.cpp)
+add_example_executable(example_gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8 gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp)
+add_example_executable(example_gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8 gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp)
+add_example_executable(example_gemm_multi_ABD_wmma_fastgelu_bf16_i8 gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp)
 add_example_executable(example_gemm_multi_ABD_xdl_fp16 gemm_multi_ABD_xdl_fp16.cpp)
 add_example_executable(example_gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp)
 add_example_executable(example_gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp)
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp
new file mode 100644
index 0000000000..a30314f58c
--- /dev/null
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+using A0DataType       = BF16;
+using AsDataType       = ck::Tuple<A0DataType>;
+using B0DataType       = I8;
+using B1DataType       = BF16;
+using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using D0DataType       = BF16;
+using DsDataType       = ck::Tuple<D0DataType>;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using AsLayout = ck::Tuple<A0Layout>;
+using B0Layout = Row;
+using B1Layout = B0Layout;
+using BsLayout = ck::Tuple<B0Layout, B1Layout>;
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+using ELayout  = Row;
+
+using Multiply    = ck::tensor_operation::element_wise::Multiply;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = Multiply;
+using CDEElementOp = AddFastGelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
+    AsLayout,
+    BsLayout,
+    DsLayout,
+    ELayout,
+    AsDataType,
+    BsDataType,
+    AccDataType,
+    CShuffleDataType,
+    DsDataType,
+    EDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmSpec,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<8, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<8, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v3>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 2;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 4096;
+    ck::index_t N = 768;
+    ck::index_t K = 6144;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideD = N;
+    ck::index_t StrideE = N;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB, B1Layout{}));
+    Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(D0DataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    b1_device_buf.ToDevice(b1_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumATensor = 1;
+    constexpr ck::index_t NumBTensor = 2;
+    constexpr ck::index_t NumDTensor = 1;
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
+                                                                   b1_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumDTensor>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               std::array<ck::index_t, NumATensor>{StrideA},
+                               std::array<ck::index_t, NumBTensor>{StrideB, StrideB},
+                               std::array<ck::index_t, NumDTensor>{StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+
+        for(int n = 0; n < N; ++n)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B1DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp
new file mode 100644
index 0000000000..086a0f4834
--- /dev/null
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+using A0DataType       = BF16;
+using AsDataType       = ck::Tuple<A0DataType>;
+using B0DataType       = I8;
+using B1DataType       = BF16;
+using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using AsLayout = ck::Tuple<A0Layout>;
+using B0Layout = Row;
+using B1Layout = B0Layout;
+using BsLayout = ck::Tuple<B0Layout, B1Layout>;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using Multiply    = ck::tensor_operation::element_wise::Multiply;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = Multiply;
+using CDEElementOp = FastGelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
+    AsLayout,
+    BsLayout,
+    DsLayout,
+    ELayout,
+    AsDataType,
+    BsDataType,
+    AccDataType,
+    CShuffleDataType,
+    DsDataType,
+    EDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmSpec,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<8, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<8, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v3>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 2;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 4096;
+    ck::index_t N = 768;
+    ck::index_t K = 6144;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideE = N;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideE = std::stoi(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB, B1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    b1_device_buf.ToDevice(b1_k_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumATensor = 1;
+    constexpr ck::index_t NumBTensor = 2;
+    constexpr ck::index_t NumDTensor = 0;
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
+                                                                   b1_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumDTensor>{},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               std::array<ck::index_t, NumATensor>{StrideA},
+                               std::array<ck::index_t, NumBTensor>{StrideB, StrideB},
+                               std::array<ck::index_t, NumDTensor>{},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        Tensor<A0DataType> a_m_k({M, K});
+
+        Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+
+        for(int n = 0; n < N; ++n)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B1DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
new file mode 100644
index 0000000000..32345d1263
--- /dev/null
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using EDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Row;
+using DLayout = Row;
+using ELayout = Row;
+
+struct AddScale
+{
+    static constexpr auto I0 = ck::Number<0>{};
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    __host__ __device__ constexpr void
+    operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const
+    {
+        const auto a0_v_t = ck::vector_type<ck::half_t, 4>{a0};
+        const auto a1_v_t = ck::vector_type<ck::half_t, 4>{a1};
+
+        auto r_v_t = ck::vector_type<ck::half_t, 4>{};
+
+        r_v_t.AsType<ck::half_t>()(I0) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I0] + a1_v_t.AsType<ck::half_t>()[I0]);
+        r_v_t.AsType<ck::half_t>()(I1) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I1] + a1_v_t.AsType<ck::half_t>()[I1]);
+        r_v_t.AsType<ck::half_t>()(I2) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I2] + a1_v_t.AsType<ck::half_t>()[I2]);
+        r_v_t.AsType<ck::half_t>()(I3) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I3] + a1_v_t.AsType<ck::half_t>()[I3]);
+
+        a = r_v_t.AsType<ck::half4_t>()[I0];
+    }
+
+    __host__ __device__ constexpr void
+    operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const
+    {
+        a = scale * (a0 + a1);
+    }
+
+    // this attribute controls the copy_function applying element_wise_op with
+    // pack4_data
+    constexpr const static bool is_pack4_invocable = true;
+
+    float scale = 1.0;
+};
+
+struct AlphaBetaAdd
+{
+    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename C, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
+        ck::half_t& e, const float& c, const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
+    };
+
+    float alpha_;
+    float beta_;
+};
+
+using AElementOp   = AddScale;
+using BElementOp   = PassThrough;
+using CDEElementOp = AlphaBetaAdd;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
+    ck::Tuple<ALayout, ALayout>,
+    ck::Tuple<BLayout>,
+    ck::Tuple<DLayout>,
+    ELayout,
+    ck::Tuple<ADataType, ADataType>,
+    ck::Tuple<BDataType>,
+    AccDataType,
+    CShuffleDataType,
+    ck::Tuple<DDataType>,
+    EDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmSpec,
+    256,
+    256,
+    128,
+    32,
+    8,
+    8,
+    16,
+    16,
+    4,
+    4,
+    S<4, 64, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<4, 64, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 64, 1, 4>,
+    S<8, 8, 8>>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideD = N;
+    ck::index_t StrideE = N;
+
+    float alpha = 1.0f;
+    float beta  = 1.0f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        alpha = std::stof(argv[4]);
+        beta  = std::stof(argv[5]);
+    }
+    else if(argc == 13)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+
+        alpha = std::stof(argv[11]);
+        beta  = std::stof(argv[12]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 12: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
+               "beta\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<ADataType> a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    a1_device_buf.ToDevice(a1_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{0.2};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{alpha, beta};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
+                                                          a1_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               std::array<ck::index_t, 2>{StrideA, StrideA},
+                               std::array<ck::index_t, 1>{StrideB},
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        Tensor<ADataType> a_m_k({M, K});
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k));
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp
new file mode 100644
index 0000000000..00e2d7e33c
--- /dev/null
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+using A0DataType       = BF16;
+using AsDataType       = ck::Tuple<A0DataType>;
+using B0DataType       = I8;
+using BsDataType       = ck::Tuple<B0DataType>;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = BF16;
+using D1DataType       = BF16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using AsLayout = ck::Tuple<A0Layout>;
+using B0Layout = Row;
+using BsLayout = ck::Tuple<B0Layout>;
+using D0Layout = Row;
+using D1Layout = D0Layout;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
+using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MultiplyAddFastGelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
+    AsLayout,
+    BsLayout,
+    DsLayout,
+    ELayout,
+    AsDataType,
+    BsDataType,
+    AccDataType,
+    CShuffleDataType,
+    DsDataType,
+    EDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmSpec,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<8, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<8, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v3>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 4096;
+    ck::index_t N = 768;
+    ck::index_t K = 6144;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideD = N;
+    ck::index_t StrideE = N;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumATensor = 1;
+    constexpr ck::index_t NumBTensor = 1;
+    constexpr ck::index_t NumDTensor = 2;
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               std::array<ck::index_t, NumATensor>{StrideA},
+                               std::array<ck::index_t, NumBTensor>{StrideB},
+                               std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B0DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
index 5f3bba922f..405eac7df1 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -67,7 +67,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
 ///######|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 ///######|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///######|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               8,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
+         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               4,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
 // clang-format on
 
 int main(int argc, char* argv[])
@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
     ck::index_t N = 768;
     ck::index_t K = 6144;
 
-    ck::index_t StrideA = K;
-    ck::index_t StrideB = N;
-    ck::index_t StrideD = 0;
-    ck::index_t StrideE = N;
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = N;
+    ck::index_t StrideB1 = 0;
+    ck::index_t StrideD  = 0;
+    ck::index_t StrideE  = N;
 
     if(argc == 1)
     {
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row,
+                                       std::size_t col,
+                                       ck::index_t& stride,
+                                       auto layout) {
+        using namespace ck::literals;
 
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
-    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
+    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
     Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
                                N,
                                K,
                                std::array<ck::index_t, NumATensor>{StrideA},
-                               std::array<ck::index_t, NumBTensor>{StrideB, 0},
+                               std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
                                std::array<ck::index_t, NumDTensor>{StrideD},
                                StrideE,
                                a_element_op,
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
index 95cf8f3674..50e670bdf3 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -67,7 +67,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
 ///######|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 ///######|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///######|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               8,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
+         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               4,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
 // clang-format on
 
 int main(int argc, char* argv[])
@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
     ck::index_t N = 768;
     ck::index_t K = 6144;
 
-    ck::index_t StrideA = K;
-    ck::index_t StrideB = N;
-    ck::index_t StrideD = 0;
-    ck::index_t StrideE = N;
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = N;
+    ck::index_t StrideB1 = 0;
+    ck::index_t StrideD  = 0;
+    ck::index_t StrideE  = N;
 
     if(argc == 1)
     {
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row,
+                                       std::size_t col,
+                                       ck::index_t& stride,
+                                       auto layout) {
+        using namespace ck::literals;
 
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
-    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
+    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
     Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
                                N,
                                K,
                                std::array<ck::index_t, NumATensor>{StrideA},
-                               std::array<ck::index_t, NumBTensor>{StrideB, 0},
+                               std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
                                std::array<ck::index_t, NumDTensor>{},
                                StrideE,
                                a_element_op,
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
index 2582ea8a11..2a44c8ad2a 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
@@ -127,10 +127,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
     32,
     8,
     8,
-    32,
-    32,
+    16,
+    16,
+    8,
     4,
-    2,
     S<4, 64, 1>,
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -148,7 +148,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
     1,
     1,
     S<1, 32, 1, 8>,
-    8>;
+    4>;
 
 int main(int argc, char* argv[])
 {
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
index 07b9db4620..50e1c21c8f 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -66,7 +66,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
 ///######|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 ///######|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///######|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               8,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
+         < AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,  16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,               4,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
 // clang-format on
 
 int main(int argc, char* argv[])
@@ -80,10 +80,11 @@ int main(int argc, char* argv[])
     ck::index_t N = 768;
     ck::index_t K = 6144;
 
-    ck::index_t StrideA = K;
-    ck::index_t StrideB = N;
-    ck::index_t StrideD = 0;
-    ck::index_t StrideE = N;
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = N;
+    ck::index_t StrideB1 = 0;
+    ck::index_t StrideD  = 0;
+    ck::index_t StrideE  = N;
 
     if(argc == 1)
     {
@@ -119,23 +120,31 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row,
+                                       std::size_t col,
+                                       ck::index_t& stride,
+                                       auto layout) {
+        using namespace ck::literals;
 
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
-    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
+    Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
     Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
                                K,
                                std::array<ck::index_t, NumATensor>{StrideA},
                                std::array<ck::index_t, NumBTensor>{StrideB},
-                               std::array<ck::index_t, NumDTensor>{0, StrideD},
+                               std::array<ck::index_t, NumDTensor>{StrideB1, StrideD},
                                StrideE,
                                a_element_op,
                                b_element_op,
@@ -261,7 +270,7 @@ int main(int argc, char* argv[])
         {
             for(int n = 0; n < N; ++n)
             {
-                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(0, n), d_m_n(m, n));
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(m, n), d_m_n(m, n));
             }
         }
 
diff --git a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
index 57e2feb084..a9a30b4c27 100644
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -19,6 +19,9 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -94,10 +97,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceContractionMultiple
     32,
     8,
     8,
-    32,
-    32,
+    16,
+    16,
+    8,
     4,
-    2,
     S<4, 64, 1>,
     S<1, 0, 2>,
     S<1, 0, 2>,
@@ -115,7 +118,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceContractionMultiple
     1,
     1,
     S<1, 32, 1, 8>,
-    8>;
+    4>;
 
 int main(int argc, char* argv[])
 {
@@ -160,12 +163,12 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
-    Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
+    Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
+    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
     std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
     std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
@@ -264,9 +267,9 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
 
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
-        Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
+        Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
 
         for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
         {
@@ -299,7 +302,6 @@ int main(int argc, char* argv[])
         auto ref_op      = ReferenceOpInstance{};
         auto ref_invoker = ref_op.MakeInvoker();
 
-        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
         auto ref_argument =
             ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, PassThrough{}, b_element_op);
 
diff --git a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp
index ec1b2d6018..4f7414abfa 100644
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp
@@ -19,6 +19,9 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/numeric.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -140,12 +143,12 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
-    Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
-    Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
-    Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides);
-    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
+    Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
+    Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
+    Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides, Row{});
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
     std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
     std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
@@ -246,9 +249,9 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
 
-        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
 
-        Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
+        Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
 
         for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
         {
@@ -266,7 +269,7 @@ int main(int argc, char* argv[])
             }
         }
 
-        Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
+        Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
 
         for(size_t n0 = 0; n0 < b_ns_ks.mDesc.GetLengths()[0]; ++n0)
         {
diff --git a/example/62_convnd_activ/binary/CMakeLists.txt b/example/62_convnd_activ/binary/CMakeLists.txt
index b9584be89c..f23f908883 100644
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
@@ -1,15 +1,9 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_activ_binary_xdl)
-      # Bilinear residual
-      add_example_executable(example_convnd_fwd_xdl_bilinear_residual_fp16 convnd_fwd_xdl_bilinear_residual_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_fwd_xdl_bilinear_residual_fp16)
-      add_example_executable(example_convnd_bwd_data_xdl_bilinear_residual_fp16 convnd_bwd_data_xdl_bilinear_residual_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_data_xdl_bilinear_residual_fp16)
-      add_example_executable(example_convnd_bwd_weight_xdl_bilinear_residual_fp16 convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_weight_xdl_bilinear_residual_fp16)
-      set(target 1)
- endif()
-endforeach()
+add_custom_target(example_convnd_activ_binary_xdl)
+# Bilinear residual
+add_example_executable(example_convnd_fwd_xdl_bilinear_residual_fp16 convnd_fwd_xdl_bilinear_residual_fp16.cpp)
+add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_fwd_xdl_bilinear_residual_fp16)
+add_example_executable(example_convnd_bwd_data_xdl_bilinear_residual_fp16 convnd_bwd_data_xdl_bilinear_residual_fp16.cpp)
+add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_data_xdl_bilinear_residual_fp16)
+add_example_executable(example_convnd_bwd_weight_xdl_bilinear_residual_fp16 convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp)
+add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_weight_xdl_bilinear_residual_fp16)
+
diff --git a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
index f5bddf2302..2710dd6b63 100644
--- a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -70,10 +70,10 @@ using DeviceGroupedConvNDBwdDataInstance =
         32,          // KPerBlock
         8,           // AK1
         2,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -91,7 +91,7 @@ using DeviceGroupedConvNDBwdDataInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDBwdDataInstance<OutElementOp>;
 
diff --git a/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp
index fa3edc5adc..cb37ebf575 100644
--- a/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -63,10 +63,10 @@ using DeviceGroupedConvNDBwdWeightInstance =
         128,                    // NPerBlock
         4,                      // K0PerBlock
         8,                      // K1
-        32,                     // MPerXdl
-        32,                     // NPerXdl
-        2,                      // MXdlPerWave
-        2,                      // NXdlPerWave
+        16,                     // MPerXdl
+        16,                     // NPerXdl
+        4,                      // MXdlPerWave
+        4,                      // NXdlPerWave
         S<1, 4, 16, 4>,         // ABlockTransferThreadClusterLengths_K0_M_K1
         S<0, 3, 1, 2>,          // ABlockTransferThreadClusterArrangeOrder
         S<0, 2, 1, 3>,          // ABlockTransferSrcAccessOrder
@@ -84,7 +84,7 @@ using DeviceGroupedConvNDBwdWeightInstance =
         1,                      // CShuffleMXdlPerWavePerShuffle
         1,                      // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 4>,         // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+        64 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
 using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDBwdWeightInstance<WeiElementOp>;
 
 namespace {
@@ -257,4 +257,12 @@ bool run_grouped_conv(bool do_verification,
 
 #include "../run_convnd_activ_example.inc"
 
-int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    // temp disable test on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return !run_convnd_example(argc, argv);
+}
diff --git a/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp
index ae1ebcb2cd..616d0cc9e8 100644
--- a/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -71,10 +71,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
 
diff --git a/example/62_convnd_activ/convinvscale/CMakeLists.txt b/example/62_convnd_activ/convinvscale/CMakeLists.txt
index 7aae090674..c737bc00ec 100644
--- a/example/62_convnd_activ/convinvscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convinvscale/CMakeLists.txt
@@ -1,10 +1,5 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_activ_xdl_convinvscale)
-      add_example_executable(example_convnd_fwd_xdl_convinvscale_fp8 convnd_fwd_xdl_convinvscale_fp8.cpp)
-      add_example_dependencies(example_convnd_activ_xdl_convinvscale example_convnd_fwd_xdl_convinvscale_fp8)
-      set(target 1)
- endif()
-endforeach()
\ No newline at end of file
+if (NOT GPU_TARGETS MATCHES "gfx11")
+    add_custom_target(example_convnd_activ_xdl_convinvscale)
+    add_example_executable(example_convnd_fwd_xdl_convinvscale_fp8 convnd_fwd_xdl_convinvscale_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convinvscale example_convnd_fwd_xdl_convinvscale_fp8)
+endif()
\ No newline at end of file
diff --git a/example/62_convnd_activ/convinvscale/convnd_fwd_xdl_convinvscale_fp8.cpp b/example/62_convnd_activ/convinvscale/convnd_fwd_xdl_convinvscale_fp8.cpp
index fbdfc72063..2194c536c0 100644
--- a/example/62_convnd_activ/convinvscale/convnd_fwd_xdl_convinvscale_fp8.cpp
+++ b/example/62_convnd_activ/convinvscale/convnd_fwd_xdl_convinvscale_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convinvscale_common.hpp"
 
@@ -58,10 +58,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -79,7 +79,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
index d101fd59bd..0a802ee27d 100644
--- a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
+++ b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
@@ -74,10 +74,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -95,7 +95,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
 
@@ -130,11 +130,12 @@ bool run_grouped_conv(bool do_verification,
     // Fill other lenghts than G,K with 1 and strides with 0
     bias_g_k_lengths.fill(1);
     bias_g_k_strides.fill(0);
-    bias_g_k_lengths[0]              = G;
-    bias_g_k_lengths[2]              = K;
-    bias_g_k_strides[0]              = K; // stride to G
-    bias_g_k_strides[2]              = 1; // stride to K
-    const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
+    bias_g_k_lengths[0] = G;
+    bias_g_k_lengths[2] = K;
+    bias_g_k_strides[0] = K; // stride to G
+    bias_g_k_strides[2] = 1; // stride to K
+    const auto broadcasted_bias_desc =
+        HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides, BiasLayout{});
 
     //  y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
     Tensor<InDataType> in(in_g_n_c_wis_desc);
diff --git a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
index f784655cc5..3266c55d7c 100644
--- a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+++ b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -71,10 +71,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
 
diff --git a/example/62_convnd_activ/convscale/CMakeLists.txt b/example/62_convnd_activ/convscale/CMakeLists.txt
index 26f6c1b168..8746a5ad54 100644
--- a/example/62_convnd_activ/convscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale/CMakeLists.txt
@@ -1,20 +1,14 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-     add_custom_target(example_convnd_activ_xdl_convscale)
-     add_example_executable(example_convnd_fwd_xdl_convscale_fp8 convnd_fwd_xdl_convscale_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_fp8 )
+if (NOT GPU_TARGETS MATCHES "gfx11")
+    add_custom_target(example_convnd_activ_xdl_convscale)
+    add_example_executable(example_convnd_fwd_xdl_convscale_fp8 convnd_fwd_xdl_convscale_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_fp8 )
 
-     add_example_executable(example_convnd_fwd_xdl_convscale_bf8 convnd_fwd_xdl_convscale_bf8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8)
+    add_example_executable(example_convnd_fwd_xdl_convscale_bf8 convnd_fwd_xdl_convscale_bf8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8)
 
-     add_example_executable(example_convnd_fwd_xdl_convscale_fp8_bf8 convnd_fwd_xdl_convscale_fp8_bf8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_fp8_bf8)
+    add_example_executable(example_convnd_fwd_xdl_convscale_fp8_bf8 convnd_fwd_xdl_convscale_fp8_bf8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_fp8_bf8)
 
-     add_example_executable(example_convnd_fwd_xdl_convscale_bf8_fp8 convnd_fwd_xdl_convscale_bf8_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8_fp8)
-
-     set(target 1)
- endif()
-endforeach()
+    add_example_executable(example_convnd_fwd_xdl_convscale_bf8_fp8 convnd_fwd_xdl_convscale_bf8_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8_fp8)
+endif()
diff --git a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8.cpp b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8.cpp
index c1c8c3a57f..f7ad53221c 100644
--- a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8.cpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_common.hpp"
 
@@ -58,10 +58,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -79,7 +79,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8_fp8.cpp b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8_fp8.cpp
index 8590d0620f..6f0337b85e 100644
--- a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8_fp8.cpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_bf8_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_common.hpp"
 
@@ -58,10 +58,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -79,7 +79,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp
index a7d69ccffc..7046c93f9f 100644
--- a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_common.hpp"
 
@@ -58,10 +58,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -79,7 +79,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8_bf8.cpp b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8_bf8.cpp
index ab59e08a80..3376b9aba3 100644
--- a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8_bf8.cpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_common.hpp"
 
@@ -58,10 +58,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -79,7 +79,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale_add/CMakeLists.txt b/example/62_convnd_activ/convscale_add/CMakeLists.txt
index b2e0eecb58..5dac630298 100644
--- a/example/62_convnd_activ/convscale_add/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_add/CMakeLists.txt
@@ -1,11 +1,5 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-     add_custom_target(example_convnd_activ_xdl_convscale_add)
-     add_example_executable(example_convnd_fwd_xdl_convscale_add_fp8 convnd_fwd_xdl_convscale_add_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale_add example_convnd_fwd_xdl_convscale_add_fp8 )
-
-     set(target 1)
- endif()
-endforeach()
+if (NOT GPU_TARGETS MATCHES "gfx11")
+    add_custom_target(example_convnd_activ_xdl_convscale_add)
+    add_example_executable(example_convnd_fwd_xdl_convscale_add_fp8 convnd_fwd_xdl_convscale_add_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale_add example_convnd_fwd_xdl_convscale_add_fp8)
+endif()
\ No newline at end of file
diff --git a/example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp b/example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
index 3f592b2c54..71dddcfe91 100644
--- a/example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
+++ b/example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/tuple.hpp"
 #include "convnd_fwd_convscale_add_common.hpp"
@@ -57,10 +57,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -78,7 +78,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
index 739c855ae4..c1c64671b4 100644
--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
@@ -1,14 +1,8 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-     add_custom_target(example_convnd_activ_xdl_convscale_reduce)
-     add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)
+if (NOT GPU_TARGETS MATCHES "gfx11")
+    add_custom_target(example_convnd_activ_xdl_convscale_reduce)
+    add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)
 
-     add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
-
-     set(target 1)
- endif()
-endforeach()
+    add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
+endif()
\ No newline at end of file
diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
index a8b4fdbead..7f0b2329f6 100644
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_reduce_common.hpp"
 
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,7 +73,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
index df6bf7bd5c..9a7de75d00 100644
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_reduce_common.hpp"
 
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,7 +73,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/convscale_relu/CMakeLists.txt b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
index c3241aecf2..024b79e2af 100644
--- a/example/62_convnd_activ/convscale_relu/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
@@ -1,11 +1,5 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-     add_custom_target(example_convnd_activ_xdl_convscale_relu)
-     add_example_executable(example_convnd_fwd_xdl_convscale_relu_fp8 convnd_fwd_xdl_convscale_relu_fp8.cpp)
-     add_example_dependencies(example_convnd_activ_xdl_convscale_relu example_convnd_fwd_xdl_convscale_relu_fp8 )
-
-     set(target 1)
- endif()
-endforeach()
+if (NOT GPU_TARGETS MATCHES "gfx11")
+    add_custom_target(example_convnd_activ_xdl_convscale_relu)
+    add_example_executable(example_convnd_fwd_xdl_convscale_relu_fp8 convnd_fwd_xdl_convscale_relu_fp8.cpp)
+    add_example_dependencies(example_convnd_activ_xdl_convscale_relu example_convnd_fwd_xdl_convscale_relu_fp8)
+endif()
diff --git a/example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp b/example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
index 360349e7ec..4fac49133c 100644
--- a/example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
+++ b/example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_convscale_relu_common.hpp"
 
@@ -56,10 +56,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -77,7 +77,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8,
+        4,
         AComputeDataType,
         BComputeDataType>;
 
diff --git a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
index 8441030945..359b444dd0 100644
--- a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
@@ -1,45 +1,37 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_activ_dynamic_unary_xdl)
-      # Sigmoid
-      add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16)
-      # Tanh
-      add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16)
-      # Relu
-      add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16)
-      # SoftRelu
-      add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16)
-      # Abs
-      add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16)
-      # Pow
-      add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16)
-      # Clipped Relu
-      add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16)
-      # Leaky Relu
-      add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16)
-      # Elu
-      add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16)
-      # Swish
-      add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16)
-      # PassThrough
-      add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
-      # Logistic
-      add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)
-   
-   set(target 1)
- endif()
-endforeach()
+add_custom_target(example_convnd_activ_dynamic_unary_xdl)
+# Sigmoid
+add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16)
+# Tanh
+add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16)
+# Relu
+add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16)
+# SoftRelu
+add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16)
+# Abs
+add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16)
+# Pow
+add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16)
+# Clipped Relu
+add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16)
+# Leaky Relu
+add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16)
+# Elu
+add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16)
+# Swish
+add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16)
+# PassThrough
+add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
+# Logistic
+add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
+add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)
\ No newline at end of file
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
index ed31be19ee..4af7f4535a 100644
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -71,10 +71,10 @@ using DeviceGroupedConvNDActivInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceGroupedConvNDActivInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 template <ck::index_t NDimSpatial,
           typename InDataType,
diff --git a/example/62_convnd_activ/multi_AB/CMakeLists.txt b/example/62_convnd_activ/multi_AB/CMakeLists.txt
index 149bd6f03e..80a3a8f196 100644
--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
@@ -1,17 +1,10 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_activ_multi_ab_xdl)
-      # ScaleAdd on A and B
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 conv_fwd_xdl_scaleadd_ab_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 conv_fwd_xdl_scaleadd_ab_fp32.cpp)
-      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 conv_fwd_xdl_scaleadd_ab_bf16.cpp)
-      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 conv_fwd_xdl_scaleadd_ab_int8.cpp)
-      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
-   set(target 1)
- endif()
-endforeach()
+add_custom_target(example_convnd_activ_multi_ab_xdl)
+# ScaleAdd on A and B
+add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 conv_fwd_xdl_scaleadd_ab_fp16.cpp)
+add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
+add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 conv_fwd_xdl_scaleadd_ab_fp32.cpp)
+add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
+add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 conv_fwd_xdl_scaleadd_ab_bf16.cpp)
+add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
+add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 conv_fwd_xdl_scaleadd_ab_int8.cpp)
+add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
diff --git a/example/62_convnd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp b/example/62_convnd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
index bef9980b3e..f63add8cfd 100644
--- a/example/62_convnd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+++ b/example/62_convnd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_activ_multi_ab_common.hpp"
 
@@ -23,4 +23,14 @@ using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<D
 
 #include "../run_convnd_activ_example.inc"
 
-int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        std::cout << "FP32 are not supported on gfx11 and gfx12" << std::endl;
+        return 0;
+    }
+
+    return !run_convnd_example(argc, argv);
+}
diff --git a/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp b/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
index 2626843ed4..566aa50d23 100644
--- a/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+++ b/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -68,10 +68,10 @@ using DeviceGroupedConvNDMultiABFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -89,7 +89,7 @@ using DeviceGroupedConvNDMultiABFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 namespace {
 template <ck::index_t NDimSpatial,
diff --git a/example/62_convnd_activ/unary/CMakeLists.txt b/example/62_convnd_activ/unary/CMakeLists.txt
index 36b4ffc9f4..2b54b1f590 100644
--- a/example/62_convnd_activ/unary/CMakeLists.txt
+++ b/example/62_convnd_activ/unary/CMakeLists.txt
@@ -1,45 +1,37 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_activ_unary_xdl)
-      # Sigmoid
-      add_example_executable(example_convnd_fwd_xdl_sigmoid_fp16 convnd_fwd_xdl_sigmoid_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_sigmoid_fp16)
-      # Tanh
-      add_example_executable(example_convnd_fwd_xdl_tanh_fp16 convnd_fwd_xdl_tanh_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_tanh_fp16)
-      # Relu
-      add_example_executable(example_convnd_fwd_xdl_relu_fp16 convnd_fwd_xdl_relu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_relu_fp16)
-      # SoftRelu
-      add_example_executable(example_convnd_fwd_xdl_softrelu_fp16 convnd_fwd_xdl_softrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_softrelu_fp16)
-      # Abs
-      add_example_executable(example_convnd_fwd_xdl_abs_fp16 convnd_fwd_xdl_abs_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_abs_fp16)
-      # Pow
-      add_example_executable(example_convnd_fwd_xdl_pow_fp16 convnd_fwd_xdl_pow_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_pow_fp16)
-      # Clipped Relu
-      add_example_executable(example_convnd_fwd_xdl_clippedrelu_fp16 convnd_fwd_xdl_clippedrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_clippedrelu_fp16)
-      # Leaky Relu
-      add_example_executable(example_convnd_fwd_xdl_leakyrelu_fp16 convnd_fwd_xdl_leakyrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_leakyrelu_fp16)
-      # Elu
-      add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_elu_fp16)
-      # Swish
-      add_example_executable(example_convnd_fwd_xdl_swish_fp16 convnd_fwd_xdl_swish_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_swish_fp16)
-      # PassThrough
-      add_example_executable(example_convnd_fwd_xdl_passthrough_fp16 convnd_fwd_xdl_passthrough_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_passthrough_fp16)
-      # Logistic
-      add_example_executable(example_convnd_fwd_xdl_logistic_fp16 convnd_fwd_xdl_logistic_fp16.cpp)
-      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_logistic_fp16)
-   
-   set(target 1)
- endif()
-endforeach()
+add_custom_target(example_convnd_activ_unary_xdl)
+# Sigmoid
+add_example_executable(example_convnd_fwd_xdl_sigmoid_fp16 convnd_fwd_xdl_sigmoid_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_sigmoid_fp16)
+# Tanh
+add_example_executable(example_convnd_fwd_xdl_tanh_fp16 convnd_fwd_xdl_tanh_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_tanh_fp16)
+# Relu
+add_example_executable(example_convnd_fwd_xdl_relu_fp16 convnd_fwd_xdl_relu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_relu_fp16)
+# SoftRelu
+add_example_executable(example_convnd_fwd_xdl_softrelu_fp16 convnd_fwd_xdl_softrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_softrelu_fp16)
+# Abs
+add_example_executable(example_convnd_fwd_xdl_abs_fp16 convnd_fwd_xdl_abs_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_abs_fp16)
+# Pow
+add_example_executable(example_convnd_fwd_xdl_pow_fp16 convnd_fwd_xdl_pow_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_pow_fp16)
+# Clipped Relu
+add_example_executable(example_convnd_fwd_xdl_clippedrelu_fp16 convnd_fwd_xdl_clippedrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_clippedrelu_fp16)
+# Leaky Relu
+add_example_executable(example_convnd_fwd_xdl_leakyrelu_fp16 convnd_fwd_xdl_leakyrelu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_leakyrelu_fp16)
+# Elu
+add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_elu_fp16)
+# Swish
+add_example_executable(example_convnd_fwd_xdl_swish_fp16 convnd_fwd_xdl_swish_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_swish_fp16)
+# PassThrough
+add_example_executable(example_convnd_fwd_xdl_passthrough_fp16 convnd_fwd_xdl_passthrough_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_passthrough_fp16)
+# Logistic
+add_example_executable(example_convnd_fwd_xdl_logistic_fp16 convnd_fwd_xdl_logistic_fp16.cpp)
+add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_logistic_fp16)
diff --git a/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp b/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp
index 4669465bf4..dd171d9ed3 100644
--- a/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp
+++ b/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -71,10 +71,10 @@ using DeviceGroupedConvNDFwdInstance =
         32,          // KPerBlock
         8,           // AK1
         8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         1,
         S<1, 32, 1, 8>,
-        8>;
+        4>;
 
 template <ck::index_t NDimSpatial,
           typename InDataType,
diff --git a/example/64_fpAintB_gemm/run_gemm_example.inc b/example/64_fpAintB_gemm/run_gemm_example.inc
index dc2bdc18f0..41c8c42bac 100644
--- a/example/64_fpAintB_gemm/run_gemm_example.inc
+++ b/example/64_fpAintB_gemm/run_gemm_example.inc
@@ -28,7 +28,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<QuantDataType> quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     // assume scale tensor is [1, n]
-    Tensor<ScaleDataType> scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{}));
+    Tensor<ScaleDataType> scale_k_n(
+        HostTensorDescriptor({K, N}, {0, 1_uz}, ck::tensor_layout::BypassLayoutVerification()));
 
     switch(config.init_method)
     {
diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index d1e1a51afd..74930d2b21 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -16,7 +16,7 @@ add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp)
 add_example_executable(example_moe_gemm2_xdl_fp8_blockscale moe_gemm2_xdl_fp8_blockscale.cpp)
 add_example_executable(example_moe_gemm1_xdl_fp8_blockscale moe_gemm1_xdl_fp8_blockscale.cpp)
 
-list(APPEND gpu_list gfx942 gfx950)
+list(APPEND gpu_list gfx942 gfx950 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx11-generic gfx12-generic)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -70,3 +70,5 @@ example_compile_options(example_gemm_multiply_multiply_xdl_fp8_blockscale_bpresh
 
 example_compile_options(example_moe_gemm2_xdl_fp8_blockscale PRIVATE ${BLOCKSCALE_GEMM_OPTIONS})
 example_compile_options(example_moe_gemm1_xdl_fp8_blockscale PRIVATE ${BLOCKSCALE_GEMM_OPTIONS})
+
+add_example_executable(example_gemm_add_add_wmma_fp16 gemm_add_add_wmma_fp16.cpp)
diff --git a/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp b/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp
new file mode 100644
index 0000000000..54abab2f60
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F16;
+using B0DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F16;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+struct AddAdd
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, float, float, float>(
+        ck::half_t& e, const float& c, const float& d0, const float& d1) const
+    {
+        const float x0_f = c + d0 + d1;
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+};
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAdd;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3
+    // clang-format off
+    //#########################|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|        CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|                                    BlkGemm|                          BlkGemm|
+    //#########################|         |         |         |        |       Type|       Type|       Type|      Type|        Type|        DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|                                  PipeSched|                      PipelineVer|
+    //#########################|         |         |         |        |           |           |           |          |            |                |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |                                           |                                 |
+    //#########################|         |         |         |        |           |           |           |          |            |                |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|               S<C, D..>|                                           |                                 |
+                              <  A0Layout, B0Layout, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp,  BElementOp, CDEElementOp, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,       S<1, 32, 1, 4>,               S<8, 8, 8>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideD = K;
+    ck::index_t StrideE = N;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{0, 2});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{0, 2});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 20, 50});
+
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N +
+                            sizeof(D0DataType) * M * N + sizeof(D1DataType) * M * N +
+                            sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B0DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp b/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp
index 580f38a79f..fe8fd9c100 100644
--- a/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -78,11 +78,17 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
 ///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
 ///###### RCR
-         <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   256,   128,   128,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,               8,          0,     S<4, 64, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,               8,             8,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 8>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, FP8>;
+         <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   256,   128,   128,  16,  16,  16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,               8,          0,     S<4, 64, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,               8,             8,          0,          1,           1,               S<1, 32, 1, 8>,      S<4, 4, 4>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
 {
+    // fp8 are not supported on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = false;
@@ -184,7 +190,6 @@ int main(int argc, char* argv[])
     b0_device_buf.ToDevice(b0_k_n.mData.data());
     d0_device_buf.ToDevice(d0_m_n.mData.data());
     d1_device_buf.ToDevice(d1_m_n.mData.data());
-    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
 
     auto a_element_op   = AElementOp{};
     auto b_element_op   = BElementOp{};
@@ -220,11 +225,12 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 20, 50});
 
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_btype =
-        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N +
+                            sizeof(D0DataType) * M * N + sizeof(D1DataType) * M * N +
+                            sizeof(EDataType) * M * N;
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
@@ -233,8 +239,6 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-
     if(do_verification)
     {
         Tensor<CShuffleDataType> c_m_n({M, N});
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp
index 69803c7eeb..8b8cee9e52 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp
@@ -97,11 +97,12 @@ struct MultiplyMultiply
     }
 };
 
+static constexpr int KPack = 8;
+
 void preShuffleBuffer(const F16* src, F16* dst, int N, int K, int NXdl)
 {
-    int KPack = 16 / sizeof(F16);
     int NLane = NXdl;
-    int KLane = 64 / NLane;
+    int KLane = ck::get_warp_size() / NLane;
 
     int K0 = K / (KLane * KPack);
     // K -> K0 KLane KPack
@@ -147,12 +148,12 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
         <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
                AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,
                32,   128,    128,
-               8,   8,
-               32,   32,
-               1,    1,
+               KPack,   KPack,
+               16,   16,
+               2,    2,
                S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
                S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
-               1,    1,   S<1, 16, 1, 16>, S<8, 8, 1>,
+               1,    1,   S<1, 16, 1, 16>, S<4, 4, 1>,
                ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, F16>;
 // clang-format on
 
@@ -211,6 +212,12 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             using namespace ck::literals;
@@ -234,6 +241,28 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
+    // Update strides based on tensor properties if they are <= 0
+    auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
+        if(current_stride <= 0)
+        {
+            if constexpr(std::is_same_v<decltype(layout), Row>)
+            {
+                return tensor.GetStrides()[0];
+            }
+            else
+            {
+                return tensor.GetStrides()[1];
+            }
+        }
+        return current_stride;
+    };
+
+    StrideA              = get_stride(a0_m_k, A0Layout{}, StrideA);
+    StrideB              = get_stride(b0_k_n, B0Layout{}, StrideB);
+    ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
+    ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
+    StrideE              = get_stride(e_m_n_host_result, ELayout{}, StrideE);
+
     std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
     std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
     std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -278,8 +307,6 @@ int main(int argc, char* argv[])
 
     constexpr ck::index_t NumDTensor = DsDataType::Size();
 
-    constexpr auto I0 = ck::Number<0>{};
-
     // do GEMM
     auto device_op = DeviceOpInstance{};
 
@@ -301,7 +328,7 @@ int main(int argc, char* argv[])
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, NumDTensor>{I0, I0},
+                               std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
                                StrideE,
                                KBatch,
                                a_element_op,
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
index 352d373ae5..8da49ef85d 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -162,6 +162,28 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
+    // Update strides based on tensor properties if they are <= 0
+    auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
+        if(current_stride <= 0)
+        {
+            if constexpr(std::is_same_v<decltype(layout), Row>)
+            {
+                return tensor.GetStrides()[0];
+            }
+            else
+            {
+                return tensor.GetStrides()[1];
+            }
+        }
+        return current_stride;
+    };
+
+    StrideA              = get_stride(a0_m_k, A0Layout{}, StrideA);
+    StrideB              = get_stride(b0_k_n, B0Layout{}, StrideB);
+    ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
+    ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
+    StrideE              = get_stride(e_m_n_host_result, ELayout{}, StrideE);
+
     std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
     std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
     std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -202,8 +224,6 @@ int main(int argc, char* argv[])
 
     constexpr ck::index_t NumDTensor = DsDataType::Size();
 
-    constexpr auto I0 = ck::Number<0>{};
-
     // do GEMM
     auto device_op = DeviceOpInstance{};
     auto invoker   = device_op.MakeInvoker();
@@ -218,7 +238,7 @@ int main(int argc, char* argv[])
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, NumDTensor>{I0, I0},
+                               std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
                                StrideE,
                                KBatch,
                                a_element_op,
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index fe1eca51b0..3ee4955ae4 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -251,6 +251,28 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
+    // Update strides based on tensor properties if they are <= 0
+    auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
+        if(current_stride <= 0)
+        {
+            if constexpr(std::is_same_v<decltype(layout), Row>)
+            {
+                return tensor.GetStrides()[0];
+            }
+            else
+            {
+                return tensor.GetStrides()[1];
+            }
+        }
+        return current_stride;
+    };
+
+    StrideA              = get_stride(a0_m_k, A0Layout{}, StrideA);
+    StrideB              = get_stride(b0_k_n, B0Layout{}, StrideB);
+    ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
+    ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
+    StrideE              = get_stride(e_m_n_host_result, ELayout{}, StrideE);
+
     std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
     std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
     std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -295,8 +317,6 @@ int main(int argc, char* argv[])
 
     constexpr ck::index_t NumDTensor = DsDataType::Size();
 
-    constexpr auto I0 = ck::Number<0>{};
-
     // do GEMM
     auto device_op = DeviceOpInstance{};
 
@@ -318,7 +338,7 @@ int main(int argc, char* argv[])
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, NumDTensor>{I0, I0},
+                               std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
                                StrideE,
                                KBatch,
                                a_element_op,
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
index cbbd37408e..cc01d01e64 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -125,11 +125,11 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
                 AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,
                 64,   128,   256,
                 16,   16,
-                32,   32,
-                1,    2,
+                16,   16,
+                2,    4,
                 S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                 S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-                1, 1, S<1, 32, 1, 8>, S<8, 8, 1>,
+                1, 1, S<1, 32, 1, 8>, S<4, 4, 1>,
                 ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, I8>;
 // clang-format on
 
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index 9fe9fdde78..72ea7f1cb6 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -168,7 +168,7 @@ static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType);
 static constexpr ck::index_t Nswizzle  = false;
 static constexpr ck::index_t AK1       = 16 / sizeof(A0DataType);
 static constexpr ck::index_t BK1       = 16 / sizeof(B0DataType);
-static constexpr ck::index_t EVec      = 16 / sizeof(EDataType);
+static constexpr ck::index_t EVec      = 8 / sizeof(EDataType);
 static constexpr ck::index_t D0Vec     = 1;
 static constexpr ck::index_t D1Vec     = 1;
 static constexpr ck::index_t ActOP     = 1; // 0: gelu_and_mul, 1: silu_and_mul
@@ -287,15 +287,18 @@ int main(int argc, char* argv[])
         }
     }
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
+    Tensor<B0DataType> b0_preshuffled(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
-    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_host_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
     std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
     std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
@@ -422,7 +425,7 @@ int main(int argc, char* argv[])
 
         e_device_buf.FromDevice(e_t_n_device_result.mData.data());
 
-        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
                                                                                    B0DataType,
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index c5328226ff..66627a6de6 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -301,18 +301,22 @@ int main(int argc, char* argv[])
     }
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
     Tensor<A1DataType> a1_t_k(HostTensorDescriptor(
-        {tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+        {tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}, Row{}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<B1DataType> b1_e_n_k(
         HostTensorDescriptor({experts,
                               (K + Scale_Block_K - 1) / Scale_Block_K,
                               (N + Scale_Block_N - 1) / Scale_Block_N * 2},
-                             {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+                             {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
+    Tensor<B0DataType> b0_preshuffled(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
-    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_host_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     e_t_n_device_result.SetZero();
     std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
     std::cout << "a1_t_k: " << a1_t_k.mDesc << std::endl;
@@ -463,7 +467,7 @@ int main(int argc, char* argv[])
         Tensor<float> b_e_n_k({experts, K, N * 2});
         e_device_buf.FromDevice(e_t_n_device_result.mData.data());
 
-        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         // handle scale before ref.
         for(int t = 0; t < tokens; ++t)
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
index f78e6e48a5..5e306ac6dd 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
@@ -121,6 +121,7 @@ struct MulABScaleExpertWeight
 };
 
 static constexpr bool MulRoutedWeight = true;
+static constexpr ck::index_t KPack    = 32;
 
 using CDEElementOp = MulABScaleExpertWeight; // combine MulRoutedWeight = true
 
@@ -129,7 +130,6 @@ using CDEElementOp = MulABScaleExpertWeight; // combine MulRoutedWeight = true
 #if 1
 void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
 {
-    int KPack = 32;
     int NLane = NXdl;
     int KLane = 64 / NLane;
 
@@ -169,18 +169,19 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 static constexpr ck::index_t MPerBlock = 128;
 static constexpr ck::index_t Nswizzle  = false;
 static constexpr ck::index_t Act_OP    = 1; // 0: gelu_and_mul, 1: silu_and_mul
+
 // clang-format off
 using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm<
             Row, Col, DsLayout, ELayout, 
             A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
             AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
             256,   MPerBlock,   64,    128,
-            16,   32,
+            16,   KPack,
             16,   16,
-            8,    1,
+            4,    2,
             S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
             S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0,
-            2,    1,   S<1, 32, 1, 8>, S<8, 1, 1>,
+            2,    1,   S<1, 32, 1, 8>, S<4, 1, 1>,
             ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Act_OP, Nswizzle, true, MulRoutedWeight, true, ck::index_t, A0DataType>;
 // clang-format on
 
@@ -263,15 +264,18 @@ int main(int argc, char* argv[])
     }
 
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
+    Tensor<B0DataType> b0_preshuffled(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
-    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_host_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
 
     std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
     std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
@@ -458,9 +462,10 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
     }
 
     if(time_kernel)
@@ -486,7 +491,7 @@ int main(int argc, char* argv[])
 
         e_device_buf.FromDevice(e_t_n_device_result.mData.data());
 
-        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
                                                                                    B0DataType,
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index 6a3986ea32..a6c5a8914f 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
 using F8  = ck::f8_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType       = F8;
 using B0DataType       = F8;
@@ -278,11 +279,11 @@ int main(int argc, char* argv[])
         }
     }
 
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<D0DataType> d0_t_n(
-        HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}));
+        HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
index 354957c0d1..cc42c4b815 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -292,17 +292,19 @@ int main(int argc, char* argv[])
         }
     }
 
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
     Tensor<A1DataType> a1_t_k_k(
         HostTensorDescriptor({tokens, topk, (K + Scale_Block_K - 1) / Scale_Block_K},
-                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
+                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
+                             Row{}));
 
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<B1DataType> b1_e_n_k(HostTensorDescriptor(
         {experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N + Scale_Block_N - 1) / Scale_Block_N},
-        {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
+        {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
+        Col{}));
 
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
index 3745e3d0af..29e758f9d4 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
@@ -29,8 +29,9 @@ using F16 = ck::half_t;
 using F8  = ck::f8_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType       = F8;
 using B0DataType       = I4;
@@ -85,11 +86,11 @@ struct MulABScaleExpertWeight
     }
 };
 
-using CDEElementOp = MulABScaleExpertWeight;
+using CDEElementOp         = MulABScaleExpertWeight;
+static constexpr int KPack = 32 / sizeof(B0DataType);
 
 void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
 {
-    int KPack = 32;
     int NLane = NXdl;
     int KLane = 64 / NLane;
 
@@ -135,7 +136,7 @@ static constexpr ck::index_t KPerBlock     = 128 / sizeof(A0DataType);
 static constexpr ck::index_t CShuffleNLane = 32;
 static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
 static constexpr ck::index_t AK1           = 16 / sizeof(A0DataType);
-static constexpr ck::index_t BK1           = 32 / sizeof(B0DataType);
+static constexpr ck::index_t BK1           = KPack;
 static constexpr ck::index_t EVec          = 2;
 static constexpr ck::index_t D0Vec         = 1;
 static constexpr ck::index_t D1Vec         = 1;
@@ -239,10 +240,10 @@ int main(int argc, char* argv[])
             sorted_token_ids.mData[i] = tokens;
         }
     }
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
-    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
     Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
@@ -414,9 +415,10 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
     {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
     }
 
     if(time_kernel)
diff --git a/example/66_complex_contraction_bilinear/common_instances.hpp b/example/66_complex_contraction_bilinear/common_instances.hpp
index 480ca5a0af..ed1c1dc303 100644
--- a/example/66_complex_contraction_bilinear/common_instances.hpp
+++ b/example/66_complex_contraction_bilinear/common_instances.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -37,7 +37,7 @@ using DeviceOpInstanceKK_Generic = ck::tensor_operation::device::
         //#####################################|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
         //#####################################|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
         //#####################################|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
-        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>;
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               2, ComputeDataType>;
 // clang-format on
 
 template <ck::index_t NumDimM,
diff --git a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
index 82ac0a15e1..b08d12de86 100644
--- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
+++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
@@ -95,25 +95,26 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
         exit(0);
     }
 
+    using DefaultLayout = ck::tensor_layout::gemm::RowMajor;
     // For Real Part of Complex Tensor
-    Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides);
+    Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
+    Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
+    Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
 
-    Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
+    Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
 
     // For Imaginary Part of Complex Tensor
-    Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides);
-    Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides);
-    Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides);
+    Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
+    Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
+    Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
 
-    Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
+    Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
 
     // Intermediate E tensor Definition
-    Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
+    Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
 
     std::cout << "a_ms_ks_re: " << a_ms_ks_re.mDesc << std::endl;
     std::cout << "b_ns_ks_re: " << b_ns_ks_re.mDesc << std::endl;
@@ -349,8 +350,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     if(do_verification)
     {
         // Real Part Verification
-        Tensor<CShuffleDataType> c_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
-        Tensor<CShuffleDataType> c_ms_ns_host_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result_re(
+            e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
+        Tensor<CShuffleDataType> c_ms_ns_host_result_re1(
+            e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
 
         using ReferenceOpInstance =
             ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
@@ -422,8 +425,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
         isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
 
         // Img Part Verification
-        Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
-        Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
+        Tensor<CShuffleDataType> c_ms_ns_host_result_img(
+            e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
+        Tensor<CShuffleDataType> c_ms_ns_host_result_img1(
+            e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
 
         auto ref_argument_img = ref_op.MakeArgument(
             a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
index aaf0cb3891..69c0d6558f 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
@@ -269,10 +269,12 @@ int main(int argc, char* argv[])
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
     Tensor<XDataType> a1_t_k(HostTensorDescriptor(
         {tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -281,12 +283,13 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_k_n_host_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
 
     e_t_k_n_device_result.SetZero();
     std::cout << "a0_t_k:   " << a0_t_k.mDesc << std::endl;
@@ -480,7 +483,7 @@ int main(int argc, char* argv[])
         e_device_buf.ToDevice(e_t_k_n_device_result.mData.data());
         invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
 
-        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         using ReferenceGemmInstance =
             ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
index 24ab326391..2f7762386d 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
@@ -266,10 +266,12 @@ int main(int argc, char* argv[])
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
     Tensor<XDataType> a1_t_k(HostTensorDescriptor(
         {tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -278,12 +280,13 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_k_n_host_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
 
     e_t_k_n_device_result.SetZero();
     std::cout << "a0_t_k:   " << a0_t_k.mDesc << std::endl;
@@ -477,7 +480,7 @@ int main(int argc, char* argv[])
         e_device_buf.ToDevice(e_t_k_n_device_result.mData.data());
         invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
 
-        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         using ReferenceGemmInstance =
             ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
index 08ed8e11fb..4ef068c41f 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
@@ -296,12 +296,15 @@ int main(int argc, char* argv[])
     Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
     Tensor<XDataType> a1_t_k(HostTensorDescriptor(
         {tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_e_n_k(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
     // B preshuffle
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(
+        HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -310,12 +313,13 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
-                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_k_n_host_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
-        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
 
     e_t_k_n_device_result.SetZero();
     std::cout << "a0_t_k:   " << a0_t_k.mDesc << std::endl;
@@ -506,7 +510,7 @@ int main(int argc, char* argv[])
     {
         invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
 
-        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+        Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
 
         using ReferenceGemmInstance =
             ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
index 1b8a7a16e3..317b0f9f15 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
@@ -270,14 +270,16 @@ int main(int argc, char* argv[])
 
     expert_ids.savetxt("expert_ids.txt", "int");
     sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
     Tensor<XDataType> a1_t_k_k(
         HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
-                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
+                             Row{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -286,7 +288,8 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {N * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
index 829bf9af24..5bb6454d2a 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
@@ -268,16 +268,18 @@ int main(int argc, char* argv[])
         }
     }
 
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
     Tensor<XDataType> a1_t_k_k(
         HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
-                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
+                             Row{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
     // B preshuffle
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -286,7 +288,8 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {N * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
index efbd0f0c03..333f8a3d52 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
@@ -303,16 +303,18 @@ int main(int argc, char* argv[])
 
     expert_ids.savetxt("expert_ids.txt", "int");
     sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
-    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
     Tensor<XDataType> a1_t_k_k(
         HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
-                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
-    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+                             {(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
+                             Row{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<XDataType> b1_e_n_k(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
+                             {(N * Scale_Stride_BN), 1, Scale_Stride_BN},
+                             Col{}));
     // B preshuffle
-    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
 
     // A, B Scale preshuffle
     Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -321,7 +323,8 @@ int main(int argc, char* argv[])
         {sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
     Tensor<XDataType> b_scale_preshuffled(
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
-                             {N * Scale_Stride_BN, 1, Scale_Stride_BN}));
+                             {N * Scale_Stride_BN, 1, Scale_Stride_BN},
+                             Col{}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/68_gemm_add/CMakeLists.txt b/example/68_gemm_add/CMakeLists.txt
new file mode 100644
index 0000000000..af091d32e4
--- /dev/null
+++ b/example/68_gemm_add/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_custom_target(example_gemm_add_xdl)
+
+add_example_executable(example_gemm_add_xdl_fp16 gemm_add_xdl_fp16.cpp)
+add_example_dependencies(example_gemm_add_xdl example_gemm_add_xdl_fp16)
+
+
+add_example_executable(example_gemm_add_xdl_bf16 gemm_add_xdl_bf16.cpp)
+add_example_dependencies(example_gemm_add_xdl example_gemm_add_xdl_bf16)
+
+add_custom_target(example_gemm_add_wmma)
+
+add_example_executable(example_gemm_add_wmma_bf16 gemm_add_wmma_bf16.cpp)
+add_example_dependencies(example_gemm_add_wmma example_gemm_add_wmma_bf16)
+
+add_example_executable(example_gemm_add_wmma_fp16 gemm_add_wmma_fp16.cpp)
+add_example_dependencies(example_gemm_add_wmma example_gemm_add_wmma_fp16)
+
+
+
+
+
+
diff --git a/example/68_gemm_add/common.hpp b/example/68_gemm_add/common.hpp
new file mode 100644
index 0000000000..38e77a160f
--- /dev/null
+++ b/example/68_gemm_add/common.hpp
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+using Row_Tuple  = ck::Tuple<Row>;
+using F16_Tuple  = ck::Tuple<F16>;
+using BF16_Tuple = ck::Tuple<BF16>;
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideD = 4096;
+    ck::index_t StrideE = 4096;
+};
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+inline bool
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+{
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 13)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        problem_size.M = std::stoi(argv[4]);
+        problem_size.N = std::stoi(argv[5]);
+        problem_size.K = std::stoi(argv[6]);
+
+        problem_size.StrideA = std::stoi(argv[7]);
+        problem_size.StrideB = std::stoi(argv[8]);
+        problem_size.StrideD = std::stoi(argv[9]);
+        problem_size.StrideE = std::stoi(argv[10]);
+    }
+    else
+    {
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD,"
+                     "StrideE"
+                  << std::endl;
+        return false;
+    }
+
+    return true;
+}
diff --git a/example/68_gemm_add/gemm_add_wmma_bf16.cpp b/example/68_gemm_add/gemm_add_wmma_bf16.cpp
new file mode 100644
index 0000000000..30f0aa9153
--- /dev/null
+++ b/example/68_gemm_add/gemm_add_wmma_bf16.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = BF16;
+using DsDataType       = BF16_Tuple;
+using EDataType        = BF16;
+
+using Row_Tuple = ck::Tuple<Row>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using DLayout  = Row;
+using DsLayout = Row_Tuple;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    Row,
+    Row,
+    Row_Tuple,
+    Row,
+    BF16,
+    BF16,
+    BF16_Tuple,
+    BF16,
+    F32,
+    F32,
+    PassThrough,
+    PassThrough,
+    Add,
+    GemmSpec,
+    128,
+    128,
+    64,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<4, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 4>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1>;
+
+// clang-format on
+
+#include "run_gemm_add_example_wmma.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_example(argc, argv); }
diff --git a/example/68_gemm_add/gemm_add_wmma_fp16.cpp b/example/68_gemm_add/gemm_add_wmma_fp16.cpp
new file mode 100644
index 0000000000..caf245bf76
--- /dev/null
+++ b/example/68_gemm_add/gemm_add_wmma_fp16.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using DsDataType       = F16_Tuple;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using DLayout  = Row;
+using DsLayout = Row_Tuple;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    Row,
+    Row,
+    Row_Tuple,
+    Row,
+    F16,
+    F16,
+    F16_Tuple,
+    F16,
+    F32,
+    F32,
+    PassThrough,
+    PassThrough,
+    Add,
+    GemmSpec,
+    128,
+    128,
+    64,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<4, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 4>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1>;
+
+// clang-format on
+
+#include "run_gemm_add_example_wmma.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_example(argc, argv); }
diff --git a/example/68_gemm_add/gemm_add_xdl_bf16.cpp b/example/68_gemm_add/gemm_add_xdl_bf16.cpp
new file mode 100644
index 0000000000..8861ad9cad
--- /dev/null
+++ b/example/68_gemm_add/gemm_add_xdl_bf16.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = BF16;
+using EDataType        = BF16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+                                                                   BLayout,
+                                                                   ck::Tuple<DLayout>,
+                                                                   ELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   AccDataType,
+                                                                   CShuffleDataType,
+                                                                   ck::Tuple<DDataType>,
+                                                                   EDataType,
+                                                                   AElementOp,
+                                                                   BElementOp,
+                                                                   CDEElementOp,
+                                                                   GemmSpec,
+                                                                   1,
+                                                                   256,
+                                                                   256,
+                                                                   128,
+                                                                   32,
+                                                                   8,
+                                                                   8,
+                                                                   16,
+                                                                   16,
+                                                                   8,
+                                                                   4,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   1,
+                                                                   1,
+                                                                   S<1, 32, 1, 8>,
+                                                                   4>;
+
+#include "run_gemm_add_example_xdl.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_example(argc, argv); }
diff --git a/example/68_gemm_add/gemm_add_xdl_fp16.cpp b/example/68_gemm_add/gemm_add_xdl_fp16.cpp
new file mode 100644
index 0000000000..0f21415311
--- /dev/null
+++ b/example/68_gemm_add/gemm_add_xdl_fp16.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using EDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+                                                                   BLayout,
+                                                                   ck::Tuple<DLayout>,
+                                                                   ELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   AccDataType,
+                                                                   CShuffleDataType,
+                                                                   ck::Tuple<DDataType>,
+                                                                   EDataType,
+                                                                   AElementOp,
+                                                                   BElementOp,
+                                                                   CDEElementOp,
+                                                                   GemmSpec,
+                                                                   1,
+                                                                   256,
+                                                                   256,
+                                                                   128,
+                                                                   32,
+                                                                   8,
+                                                                   8,
+                                                                   16,
+                                                                   16,
+                                                                   8,
+                                                                   4,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   1,
+                                                                   1,
+                                                                   S<1, 32, 1, 8>,
+                                                                   4>;
+
+#include "run_gemm_add_example_xdl.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_example(argc, argv); }
diff --git a/example/68_gemm_add/run_gemm_add_example_wmma.inc b/example/68_gemm_add/run_gemm_add_example_wmma.inc
new file mode 100644
index 0000000000..7a6c8ea56d
--- /dev/null
+++ b/example/68_gemm_add/run_gemm_add_example_wmma.inc
@@ -0,0 +1,145 @@
+#pragma once
+
+bool run_gemm_add(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << device_op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return pass;
+}
+
+bool run_gemm_add_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm_add(problem_size, config);
+}
diff --git a/example/68_gemm_add/run_gemm_add_example_xdl.inc b/example/68_gemm_add/run_gemm_add_example_xdl.inc
new file mode 100644
index 0000000000..97c0765c27
--- /dev/null
+++ b/example/68_gemm_add/run_gemm_add_example_xdl.inc
@@ -0,0 +1,144 @@
+#pragma once
+
+bool run_gemm_add(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << device_op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return pass;
+}
+
+bool run_gemm_add_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm_add(problem_size, config);
+}
diff --git a/example/69_gemm_add_relu/CMakeLists.txt b/example/69_gemm_add_relu/CMakeLists.txt
new file mode 100644
index 0000000000..9ab3ef5a45
--- /dev/null
+++ b/example/69_gemm_add_relu/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_custom_target(example_gemm_add_relu_xdl)
+
+add_example_executable(example_gemm_add_relu_xdl_fp16 gemm_add_relu_xdl_fp16.cpp)
+add_example_dependencies(example_gemm_add_relu_xdl example_gemm_add_relu_xdl_fp16)
+
+add_example_executable(example_gemm_add_relu_xdl_bf16 gemm_add_relu_xdl_bf16.cpp)
+add_example_dependencies(example_gemm_add_relu_xdl example_gemm_add_relu_xdl_bf16)
+
+add_custom_target(example_gemm_add_relu_wmma)
+
+add_example_executable(example_gemm_add_relu_wmma_bf16 gemm_add_relu_wmma_bf16.cpp)
+add_example_dependencies(example_gemm_add_relu_wmma example_gemm_add_relu_wmma_bf16)
+
+add_example_executable(example_gemm_add_relu_wmma_fp16 gemm_add_relu_wmma_fp16.cpp)
+add_example_dependencies(example_gemm_add_relu_wmma example_gemm_add_relu_wmma_fp16)
diff --git a/example/69_gemm_add_relu/common.hpp b/example/69_gemm_add_relu/common.hpp
new file mode 100644
index 0000000000..311cbb2dfb
--- /dev/null
+++ b/example/69_gemm_add_relu/common.hpp
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddRelu     = ck::tensor_operation::element_wise::AddRelu;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+using Row_Tuple  = ck::Tuple<Row>;
+using F16_Tuple  = ck::Tuple<F16>;
+using BF16_Tuple = ck::Tuple<BF16>;
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideD = 4096;
+    ck::index_t StrideE = 4096;
+};
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+inline bool
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+{
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 13)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        problem_size.M = std::stoi(argv[4]);
+        problem_size.N = std::stoi(argv[5]);
+        problem_size.K = std::stoi(argv[6]);
+
+        problem_size.StrideA = std::stoi(argv[7]);
+        problem_size.StrideB = std::stoi(argv[8]);
+        problem_size.StrideD = std::stoi(argv[9]);
+        problem_size.StrideE = std::stoi(argv[10]);
+    }
+    else
+    {
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD,"
+                     "StrideE"
+                  << std::endl;
+        return false;
+    }
+
+    return true;
+}
diff --git a/example/69_gemm_add_relu/gemm_add_relu_wmma_bf16.cpp b/example/69_gemm_add_relu/gemm_add_relu_wmma_bf16.cpp
new file mode 100644
index 0000000000..5c4116cc44
--- /dev/null
+++ b/example/69_gemm_add_relu/gemm_add_relu_wmma_bf16.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = BF16;
+using DsDataType       = BF16_Tuple;
+using EDataType        = BF16;
+
+using Row_Tuple = ck::Tuple<Row>;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using DLayout  = Row;
+using DsLayout = Row_Tuple;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    Row,
+    Row,
+    Row_Tuple,
+    Row,
+    BF16,
+    BF16,
+    BF16_Tuple,
+    BF16,
+    F32,
+    F32,
+    PassThrough,
+    PassThrough,
+    AddRelu,
+    GemmSpec,
+    128,
+    128,
+    64,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<4, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 4>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1>;
+
+// clang-format on
+
+#include "run_gemm_add_relu_example_wmma.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_relu_example(argc, argv); }
diff --git a/example/69_gemm_add_relu/gemm_add_relu_wmma_fp16.cpp b/example/69_gemm_add_relu/gemm_add_relu_wmma_fp16.cpp
new file mode 100644
index 0000000000..07f5197d21
--- /dev/null
+++ b/example/69_gemm_add_relu/gemm_add_relu_wmma_fp16.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using DsDataType       = F16_Tuple;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using DLayout  = Row;
+using DsLayout = Row_Tuple;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    Row,
+    Row,
+    Row_Tuple,
+    Row,
+    F16,
+    F16,
+    F16_Tuple,
+    F16,
+    F32,
+    F32,
+    PassThrough,
+    PassThrough,
+    AddRelu,
+    GemmSpec,
+    128,
+    128,
+    64,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 32, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    0,
+    S<4, 32, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    0,
+    1,
+    1,
+    S<1, 32, 1, 4>,
+    S<8, 8, 8>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1>;
+
+// clang-format on
+
+#include "run_gemm_add_relu_example_wmma.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_relu_example(argc, argv); }
diff --git a/example/69_gemm_add_relu/gemm_add_relu_xdl_bf16.cpp b/example/69_gemm_add_relu/gemm_add_relu_xdl_bf16.cpp
new file mode 100644
index 0000000000..ac5586764c
--- /dev/null
+++ b/example/69_gemm_add_relu/gemm_add_relu_xdl_bf16.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = BF16;
+using EDataType        = BF16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+                                                                   BLayout,
+                                                                   ck::Tuple<DLayout>,
+                                                                   ELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   AccDataType,
+                                                                   CShuffleDataType,
+                                                                   ck::Tuple<DDataType>,
+                                                                   EDataType,
+                                                                   AElementOp,
+                                                                   BElementOp,
+                                                                   CDEElementOp,
+                                                                   GemmSpec,
+                                                                   1,
+                                                                   256,
+                                                                   256,
+                                                                   128,
+                                                                   32,
+                                                                   8,
+                                                                   8,
+                                                                   16,
+                                                                   16,
+                                                                   8,
+                                                                   4,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   1,
+                                                                   1,
+                                                                   S<1, 32, 1, 8>,
+                                                                   4>;
+
+#include "run_gemm_add_relu_example_xdl.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_relu_example(argc, argv); }
diff --git a/example/69_gemm_add_relu/gemm_add_relu_xdl_fp16.cpp b/example/69_gemm_add_relu/gemm_add_relu_xdl_fp16.cpp
new file mode 100644
index 0000000000..f9c963b4df
--- /dev/null
+++ b/example/69_gemm_add_relu/gemm_add_relu_xdl_fp16.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using EDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+                                                                   BLayout,
+                                                                   ck::Tuple<DLayout>,
+                                                                   ELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   AccDataType,
+                                                                   CShuffleDataType,
+                                                                   ck::Tuple<DDataType>,
+                                                                   EDataType,
+                                                                   AElementOp,
+                                                                   BElementOp,
+                                                                   CDEElementOp,
+                                                                   GemmSpec,
+                                                                   1,
+                                                                   256,
+                                                                   256,
+                                                                   128,
+                                                                   32,
+                                                                   8,
+                                                                   8,
+                                                                   16,
+                                                                   16,
+                                                                   8,
+                                                                   4,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   S<4, 64, 1>,
+                                                                   S<1, 0, 2>,
+                                                                   S<1, 0, 2>,
+                                                                   2,
+                                                                   8,
+                                                                   8,
+                                                                   1,
+                                                                   1,
+                                                                   1,
+                                                                   S<1, 32, 1, 8>,
+                                                                   4>;
+
+#include "run_gemm_add_relu_example_xdl.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_relu_example(argc, argv); }
diff --git a/example/69_gemm_add_relu/run_gemm_add_relu_example_wmma.inc b/example/69_gemm_add_relu/run_gemm_add_relu_example_wmma.inc
new file mode 100644
index 0000000000..27bd4de48d
--- /dev/null
+++ b/example/69_gemm_add_relu/run_gemm_add_relu_example_wmma.inc
@@ -0,0 +1,146 @@
+#pragma once
+
+bool run_gemm_add_relu(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               1,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << device_op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return pass;
+}
+
+bool run_gemm_add_relu_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) &&
+           run_gemm_add_relu(problem_size, config);
+}
diff --git a/example/69_gemm_add_relu/run_gemm_add_relu_example_xdl.inc b/example/69_gemm_add_relu/run_gemm_add_relu_example_xdl.inc
new file mode 100644
index 0000000000..e2d45fca43
--- /dev/null
+++ b/example/69_gemm_add_relu/run_gemm_add_relu_example_xdl.inc
@@ -0,0 +1,145 @@
+#pragma once
+
+bool run_gemm_add_relu(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << device_op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return pass;
+}
+
+bool run_gemm_add_relu_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) &&
+           run_gemm_add_relu(problem_size, config);
+}
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 7bd628edf2..940e7bc5e6 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -69,7 +69,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
         #Do not build any XDL examples if gfx9 targets are not on the list
-        if(NOT EX_TARGETS MATCHES "gfx9" AND source_name MATCHES "_xdl")
+        if(NOT EX_TARGETS MATCHES "gfx9" AND NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source_name MATCHES "_xdl")
             message(DEBUG "removing xdl example ${source} ")
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
@@ -93,8 +93,8 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
             message(DEBUG "removing bf8 example ${source} ")
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
-        # Build fp8 gemm_multiply_multiply and moe only on gfx94/95
-        if(NOT EX_TARGETS MATCHES "gfx94" AND NOT EX_TARGETS MATCHES "gfx95")
+        # Build fp8 gemm_multiply_multiply and moe only on gfx94/95 and gfx12
+        if(NOT EX_TARGETS MATCHES "gfx94" AND NOT EX_TARGETS MATCHES "gfx95" AND NOT EX_TARGETS MATCHES "gfx12")
             if(source_name MATCHES "fp8" AND source_name MATCHES "(gemm_multiply_multiply|moe)")
                 message(DEBUG "Skipping ${source} example for current target")
                 list(REMOVE_ITEM FILE_NAME "${source}")
@@ -109,14 +109,14 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
     endforeach()
     if(FILE_NAME)
         if(source_name_list MATCHES "_xdl" AND NOT source_name_list MATCHES "_pk_i4")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx10-3-generic)
         elseif(source_name_list MATCHES "_wmma")
             list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(source_name_list MATCHES "_mx") #only build mx example for gfx950
             list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
-        elseif(source_name_list MATCHES "_pk_i4") #only build these examples for gfx942 and gfx950
+        elseif(source_name_list MATCHES "_pk_i4") #only build these examples for gfx942 gfx950 and rdna3/4
             message(DEBUG "trimming targets for ${FILE_NAME}")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx10-3-generic)
         endif()
         set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
         add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -192,7 +192,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
         #Do not build any XDL examples if gfx9 targets are not on the list
-        if(NOT EX_TARGETS MATCHES "gfx9" AND source_name MATCHES "_xdl")
+        if(NOT EX_TARGETS MATCHES "gfx9" AND NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source_name MATCHES "_xdl")
             message(DEBUG "removing xdl example ${source} ")
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
@@ -206,7 +206,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(source_name_list MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx10-3-generic)
         elseif(source_name_list MATCHES "_wmma")
             list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         endif()
diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt
index 5f495c76d8..b8ca26193d 100644
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -1,7 +1,19 @@
+set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
+# Currently only gfx9 archs are supported by FMHA
+list(FILTER INST_TARGETS INCLUDE REGEX "gfx9")
+if(NOT INST_TARGETS)
+  message(WARNING "Skipping Tile Engine FMHA compilation: No supported GPU targets (gfx9) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+  return()
+endif()
+
 # validate user-specified fmha_fwd API list
 set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv;pagedkv_prefill")
 set(FMHA_FWD_ENABLE_APIS "fwd" CACHE STRING
   "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(BUILD_TESTING)
+  # Build instances of all APIs for tests
+  set(FMHA_FWD_ENABLE_APIS "all")
+endif()
 if(FMHA_FWD_ENABLE_APIS STREQUAL "all")
   set(FMHA_FWD_ENABLE_APIS ${FMHA_FWD_KNOWN_APIS})
 endif()
@@ -14,7 +26,7 @@ endforeach()
 
 # "fwd" is a must-have api for the fmha_fwd example, add it if not specified
 if(NOT "fwd" IN_LIST FMHA_FWD_ENABLE_APIS)
-  list(APPEND FMHA_FWD_ENABLE_APIS "fwd")
+  list(PREPEND FMHA_FWD_ENABLE_APIS "fwd")
 endif()
 
 file(GLOB_RECURSE CODE_GEN_SCRIPTS CONFIGURE_DEPENDS
@@ -35,10 +47,19 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS
   ${CMAKE_CURRENT_LIST_DIR}/generate.py
   --api bwd
   --receipt 3
-  --optdim 32,64,128,256
+  --optdim 32,64,96,128,256
   # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd...
 )
 
+# Reduce building time by disabling instances that are not currently used in the gtests
+# TODO: Consider to use a special receipt for testing only, or even two receipts: a small subset of
+# instances for quick CI runs and a larger subset for scheduled runs (the tests skip tests when
+# there is no corresponding instance for parameters).
+if(BUILD_TESTING)
+  # Filters are in the order of FMHA_FWD_KNOWN_APIS: fwd,fwd_splitkv_combine@fwd_splitkv,fwd_appendkv,pagedkv_prefill
+  list(APPEND FMHA_FWD_CODE_GEN_COMMON_ARGS --filter *_nlogits*_nskip*,*@*_nlogits*_nbias*,*,*_nlogits*_nskip*_pagedkv)
+endif()
+
 # generate a list of kernels, but not actually emit files at config sta
 execute_process(
   COMMAND ${Python3_EXECUTABLE} ${FMHA_FWD_CODE_GEN_COMMON_ARGS}
@@ -77,72 +98,104 @@ add_custom_command(
   DEPENDS ${CODE_GEN_SCRIPTS}
 )
 
-set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
-# not using add_example_executable() to add this target, since we don't want this to have
-# to be included in "make all/install/check"
-message(DEBUG "adding example ${EXAMPLE_FMHA_FWD}")
-add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
-target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
+set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances")
+set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances")
 
-set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
-# not using add_example_executable() to add this target, since we don't want this to have
-# to be included in "make all/install/check"
-message(DEBUG "adding example ${EXAMPLE_FMHA_BWD}")
-add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp)
-target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
+message(DEBUG "adding instances ${FMHA_FWD_INSTANCES}")
+add_library(${FMHA_FWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
+target_include_directories(${FMHA_FWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${FMHA_FWD_INSTANCES} PRIVATE ${FMHA_FWD_GEN_BLOBS})
+set_source_files_properties(${FMHA_FWD_GEN_BLOBS} PROPERTIES LANGUAGE HIP)
+set_property(TARGET ${FMHA_FWD_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS})
+
+message(DEBUG "adding instances ${FMHA_BWD_INSTANCES}")
+add_library(${FMHA_BWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
+target_include_directories(${FMHA_BWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${FMHA_BWD_INSTANCES} PRIVATE ${FMHA_BWD_GEN_BLOBS})
+set_source_files_properties(${FMHA_BWD_GEN_BLOBS} PROPERTIES LANGUAGE HIP)
+set_property(TARGET ${FMHA_BWD_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS})
+
+set(FMHA_FWD_PRIVATE_COMPILE_OPTIONS)
+set(FMHA_BWD_PRIVATE_COMPILE_OPTIONS)
+set(FMHA_FWD_INTERFACE_COMPILE_OPTIONS)
+set(FMHA_BWD_INTERFACE_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+#       ... because they are auto-generated
+list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -Wno-undefined-func-template)
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -Wno-undefined-func-template)
+
+# Allow comparing floating points directly in order to check sentinel values
+list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -Wno-float-equal)
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -Wno-float-equal)
 
 # NOTE: this is dangerous since will change the whole kernel to flush denormals
 #       WIP with compiler team for an exp2 intrinsic..., then remove this
 if(NOT DEFINED FMHA_FWD_FAST_EXP2)
-  set(FMHA_FWD_FAST_EXP2 true)
+  set(FMHA_FWD_FAST_EXP2 ON)
 endif()
 
-set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)
-set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
-
-# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
-#       ... because they are auto-generated
 if(FMHA_FWD_FAST_EXP2)
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
 else()
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
 endif()
-list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -fgpu-flush-denormals-to-zero)
 
-# conditionally enable call to the fwd_splitkv API in fmha_fwd example
+# conditionally enable call to the fwd_splitkv API in fmha_fwd example and tests
 if("fwd_splitkv" IN_LIST FMHA_FWD_ENABLE_APIS)
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
 else()
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=0)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=0)
 endif()
 
-# conditionally enable call to the fwd_appendkv API in fmha_fwd example
+# conditionally enable call to the fwd_appendkv API in fmha_fwd example and tests
 if("fwd_appendkv" IN_LIST FMHA_FWD_ENABLE_APIS)
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
 else()
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
 endif()
 
-# conditionally enable call to the pagedkv_prefill API in fmha_fwd example
+# conditionally enable call to the pagedkv_prefill API in fmha_fwd example and tests
 if("pagedkv_prefill" IN_LIST FMHA_FWD_ENABLE_APIS)
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=1)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=1)
 else()
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=0)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=0)
 endif()
 
 # conditionally specify the use of OCP_FP8
 if(CK_USE_OCP_FP8)
-  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()
 
-# Allow comparing floating points directly in order to check sentinel values
-list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
-list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-float-equal)
+# use RTN_ASM on float to bfloat16 conversion by default, align with FA upstream
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
+list(APPEND FMHA_BWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
 
-target_compile_options(${EXAMPLE_FMHA_FWD} PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
-target_compile_options(${EXAMPLE_FMHA_BWD} PRIVATE ${EXAMPLE_FMHA_BWD_COMPILE_OPTIONS})
+target_compile_options(${FMHA_FWD_INSTANCES}
+  PRIVATE ${FMHA_FWD_PRIVATE_COMPILE_OPTIONS}
+  INTERFACE ${FMHA_FWD_INTERFACE_COMPILE_OPTIONS})
+target_compile_options(${FMHA_BWD_INSTANCES} 
+  PRIVATE ${FMHA_BWD_PRIVATE_COMPILE_OPTIONS}
+  INTERFACE ${FMHA_BWD_INTERFACE_COMPILE_OPTIONS})
+
+set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
+set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
+
+message(DEBUG "adding example ${EXAMPLE_FMHA_FWD}")
+# not using add_example_executable() to add this target, since we don't want this to be included in
+# "make all/install/check"
+add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL example_fmha_fwd.cpp)
+target_link_libraries(${EXAMPLE_FMHA_FWD} ${FMHA_FWD_INSTANCES})
+target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+message(DEBUG "adding example ${EXAMPLE_FMHA_BWD}")
+# not using add_example_executable() to add this target, since we don't want this to be included in
+# "make all/install/check"
+add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL example_fmha_bwd.cpp)
+target_link_libraries(${EXAMPLE_FMHA_BWD} ${FMHA_BWD_INSTANCES})
+target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 
 # add fmha_fwd_v3 example
 set(EXAMPLE_FMHA_FWD_V3 "tile_example_fmha_fwd_v3")
@@ -164,8 +217,20 @@ list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
   -Wno-undefined-func-template
   --save-temps
 )
-target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
+set(EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS)
 
+check_cxx_compiler_flag("-mllvm --amdgpu-disable-packed-fp32=1" HAS_DISABLE_PACKED_FP32)
+if(HAS_DISABLE_PACKED_FP32)
+  list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
+    -mllvm --amdgpu-disable-packed-fp32=1
+  )
+  list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS
+    -DCK_TILE_DISABLE_PACKED_FP32=1
+  )
+endif()
+
+target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
+target_compile_definitions(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS})
 # TODO: we have to turn off this global prop, otherwise the progress bar generated
 # by cmake will print too many files, execvp: /bin/sh: Argument list too long
 # however, this property may affect global
diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md
index f72d7afa02..2b872cb9b5 100644
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -36,6 +36,13 @@ args:
                 total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
                 also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
         -s_k    seqlen_k (including new key/value), -1 means equal to s (default:-1)
+                also with "-s_k=s0,s1,s2..." comma-separated ints to set seqlen per batch (group mode)
+     -s_qpad    seqlen_q stride between 2 batches (group-mode optional) (default:-1)
+                Provide positive strides per-batch to simulate physical padding on Q
+     -s_kpad    seqlen_k stride between 2 batches, currently used in group-mode only  (default:-1)
+                for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride
+                along seqlen, instead of packed, same as xformer kv_padding,
+                must be greater than or equal to s_k
           -d    head dim for q, k (default:128)
         -d_v    head dim for v, -1 means equal to d (default:-1)
     -scale_s    scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
@@ -74,11 +81,22 @@ args:
  -num_splits    number of splits for key/value. 0 to determine actual number by heuristic (default:1)
      -warmup    number of iterations before benchmark the kernel (default:5)
      -repeat    number of iterations to benchmark the kernel (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:fmha_fwd.json)
+ -q_eff_lens    Batch-mode only: per-batch effective seqlen for Q (exclude PAD) (default:"")
+                Comma-separated list of length 'b'. If empty, no override
+-kv_eff_lens    Batch-mode only: per-batch effective seqlen for KV (exclude PAD) (default:"")
+                Comma-separated list of length 'b'. If empty, no override
 ```
 Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
 Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with 
   batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=0 (in GPU memory), drop_offset=1234 (in GPU memory) fp16 case
 
+## Padding Examples
+Example 3 (Group mode with padding): `./bin/tile_example_fmha_fwd -mode=1 -b=2 -h=8 -s=1024,2048 -s_k=1024,2048 -s_qpad=1536,3072 -s_kpad=1536,3072 -d=128` will run group mode with 2 batches having different sequence lengths (1024, 2048) but physically padded to (1536, 3072) respectively.
+
+Example 4 (Batch mode with effective lengths): `./bin/tile_example_fmha_fwd -mode=0 -b=2 -h=8 -s=2048 -s_k=2048 -d=128 -q_eff_lens=1024,1536 -kv_eff_lens=1024,1536` will run batch mode where all batches use 2048 as physical sequence length but have effective lengths of (1024, 1536) for Q and KV respectively.
+
 ## support features
 Currently we are still in rapid development stage, so more features/optimizations will be coming soon.
 
@@ -126,7 +144,16 @@ Note FA use bottom-right by default to express swa case, here we require you exp
 ### dropout
 TBD
 
+### sequence padding and variable length support
+We support sequence padding and variable-length processing in both batch and group modes fmha forward to handle real-world scenarios where sequences have different lengths.
+
+**Group Mode Padding**: Use `-s_qpad` and `-s_kpad` to specify physical stride between batches, enabling padded layouts. Each batch can have different logical sequence lengths (`-s`, `-s_k`) but use larger physical strides for memory alignment.
+
+**Batch Mode Variable Length**: Use `-q_eff_lens` and `-kv_eff_lens` to specify effective sequence lengths per batch. All batches share the same physical sequence length, but the kernel processes only the effective portions. This enables efficient variable-length attention without memory waste.
+
+Both approaches optimize memory access patterns while supporting flexible sequence length requirements commonly found in transformer inference scenarios.
+
 ## FP8 experimental support
 As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have an experimental support for fp8 fmha kernels, you can evaluate the performance by setting the arg `-prec=fp8` to the `tile_example_fmha_fwd`, on a gfx942 machine and ROCm 6.0+.
 
-Currently we only support `-vlayout=c`( `hdim*seqlen` for V matrix) and `-squant=1`(static quantization) with `hdim=128` for fp8 now. Full feature support will come later.
+Currently we only support `-vlayout=r`( `seqlen*hdim` for V matrix)  for fp8 and fp8bf16 now. Full feature support will come later.
diff --git a/example/ck_tile/01_fmha/bias.hpp b/example/ck_tile/01_fmha/bias.hpp
index f9dc656f63..c07232a13a 100644
--- a/example/ck_tile/01_fmha/bias.hpp
+++ b/example/ck_tile/01_fmha/bias.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -63,31 +63,45 @@ struct bias_info
     static bias_info decode(std::string str)
     {
         bias_info info{bias_enum::no_bias, 0};
-        if(str == "0" || str == "n")
+        auto found_0 = str.find(':');
+        if(found_0 != std::string::npos)
+        {
+            std::string t = str.substr(0, found_0);
+            std::string v = str.substr(found_0 + 1);
+            if(t == "e" || t == "elementwise")
+            {
+                info.type      = bias_enum::elementwise_bias;
+                info.rank_info = std::stoi(v);
+                if(info.rank_info < 0 || info.rank_info > 2)
+                    throw std::invalid_argument("invalid bias rank: " + str);
+            }
+            else if(t == "a" || t == "alibi")
+            {
+                info.type      = bias_enum::alibi;
+                info.rank_info = std::stoi(v);
+                if(info.rank_info < 0 || info.rank_info > 1)
+                    throw std::invalid_argument("invalid bias rank: " + str);
+            }
+            else
+            {
+                throw std::invalid_argument("invalid bias value: " + str);
+            }
+        }
+        else if(str == "0" || str == "n")
         {
             info.type = bias_enum::no_bias;
         }
-        else if(str.compare(0, 1, "1") == 0 || str.compare(0, 1, "e") == 0 ||
-                str.compare(0, 11, "elementwise") == 0)
+        else if(str == "1" || str == "e" || str == "elementwise")
         {
-            info.type    = bias_enum::elementwise_bias;
-            auto found_0 = str.find(':');
-            if(found_0 != std::string::npos)
-            {
-                std::string e  = str.substr(found_0 + 1);
-                info.rank_info = atoi(e.c_str());
-            }
+            info.type = bias_enum::elementwise_bias;
         }
-        else if(str.compare(0, 1, "2") == 0 || str.compare(0, 1, "a") == 0 ||
-                str.compare(0, 5, "alibi") == 0)
+        else if(str == "2" || str == "a" || str == "alibi")
         {
-            info.type    = bias_enum::alibi;
-            auto found_0 = str.find(':');
-            if(found_0 != std::string::npos)
-            {
-                std::string e  = str.substr(found_0 + 1);
-                info.rank_info = atoi(e.c_str());
-            }
+            info.type = bias_enum::alibi;
+        }
+        else
+        {
+            throw std::invalid_argument("invalid bias value: " + str);
         }
         return info;
     }
diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
index 42a9d5148a..81d34484a5 100644
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -1,16 +1,19 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 
 FWD_DTYPE_MAP = {
+    "fp32"   : "FmhaFwdFp32",
     "fp16"   : "FmhaFwdFp16",
     "bf16"   : "FmhaFwdBf16",
     "fp8"    : "FmhaFwdFp8",
     "fp8fp16": "FmhaFwdFp8Fp16",
-    "fp8bf16": "FmhaFwdFp8Bf16"
+    "fp8bf16": "FmhaFwdFp8Bf16",
+    "fp8fp32": "FmhaFwdFp8Fp32"
 }
 
 BWD_DTYPE_MAP = {
+    "fp32": "FmhaBwdFp32",
     "fp16": "FmhaBwdFp16",
     "bf16": "FmhaBwdBf16"
 }
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
index 0d8f366d8a..e2f69fa49a 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
@@ -601,6 +601,13 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
+
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == 'fp32'
+                    if not cond:
+                        continue
+
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index 0391191fb2..7319ef7ea1 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -50,16 +50,10 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape<fmha_block_tile_{F_idx}
                                                          fmha_warp_tile2_{F_idx},
                                                          {F_maxq}>;
 
-using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<false,  /* kPadSeqLenQ */
-                                                       false,  /* kPadSeqLenK */
-                                                       {F_dpad},
+using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaBwdTraits<{F_dpad},
                                                        {F_dvpad},
-                                                       false,
                                                        {F_bias},
                                                        {F_dbias},
-                                                       false,
-                                                       false,
-                                                       false,
                                                        {F_occupancy}>;
 using fmha_mask_{F_idx}      = {F_mask};
 using fmha_dropout_{F_idx}   = {F_dropout};
@@ -94,19 +88,19 @@ using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
     ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                       typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType,
                                       false,
-                                      {F_dpad}>>;
+                                      ({F_dpad} > 0)>>;
 
 using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
     ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                       typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType,
                                       false,
-                                      {F_dvpad}>>;
+                                      ({F_dvpad} > 0)>>;
 
 using fmha_bwd_dq_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
     ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                       typename FmhaBwdTypeConfig<{F_dtype}>::QGradDataType,
                                       false,
-                                      {F_dpad}>>;
+                                      ({F_dpad} > 0)>>;
 
 using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
     ck_tile::FmhaBwdDQDKDVKernel<fmha_bwd_pipeline_{F_idx},
@@ -125,7 +119,8 @@ using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim},
                                                          {F_dvpad},
                                                          {F_deterministic},
                                                          {F_trload},
-                                                         {F_maxq}>;
+                                                         {F_maxq},
+                                                         {F_bn0}>;
 
 #include <iostream>
 
@@ -199,7 +194,7 @@ float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a)
 
 template <>
 float fmha_bwd<2>(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
-    const bool has_load_tr = ck_tile::is_load_tr_supported();
+    [[maybe_unused]] const bool has_load_tr = ck_tile::is_load_tr_supported();
     float r = -1;
 {F_dispatch}
     return r;
@@ -218,10 +213,10 @@ def FMHA_BWD_API_COND_STATEMENT(F_cond: str, F_body: str, *, indent=0, if_ = 0)
 
 FMHA_BWD_API_INNER_DISPATCH="""
 {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
-        ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
-    using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>;
-    using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}, {F_trload}, {F_maxq}>;
-    using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>;
+        ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic}){F_cond_extra}) {{
+    using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, ({F_dvpad} > 0)>;
+    using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}, {F_trload}, {F_maxq}, {F_bn0}>;
+    using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, ({F_dpad} > 0), {F_deterministic}, {F_convert_dq_bn0}>;
     r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, std::conditional_t<{F_convert_dq_enabled}, convert_dq_trait_, void>>(s, a);
     return r;
 }}
@@ -277,8 +272,8 @@ class FmhaBwdDQDKDVKernel:
     F_hdim          : int  # hdim
     F_dtype         : str  # data type
     F_tile          : FmhaBwdDQDKDVTileSize
-    F_dpad          : str  #
-    F_dvpad         : str  #
+    F_dpad          : Literal[0, 8 ,1]
+    F_dvpad         : Literal[0, 8 ,1]
     F_bias          : str  #
     F_dbias         : str  #
     F_dropout       : str  #
@@ -319,8 +314,8 @@ class FmhaBwdDQDKDVKernel:
                 F_wm1           = self.F_tile.F_wm1,
                 F_wn1           = self.F_tile.F_wn1,
                 F_wk1           = self.F_tile.F_wk1,
-                F_dpad          = BOOL_MAP[self.F_dpad],
-                F_dvpad         = BOOL_MAP[self.F_dvpad],
+                F_dpad          = self.F_dpad,
+                F_dvpad         = self.F_dvpad,
                 F_bias          = BIAS_MAP[self.F_bias],
                 F_dbias         = BOOL_MAP[self.F_dbias],
                 F_dropout       = DROPOUT_MAP[self.F_dropout],
@@ -336,8 +331,8 @@ class FmhaBwdDQDKDVKernel:
     def name(self) -> str:
         def pad_name() -> str:
             n = ''
-            if self.F_dpad == 't' : n += 'd'
-            if self.F_dvpad == 't' : n += 'dv'
+            if self.F_dpad : n += f'd{self.F_dpad}'
+            if self.F_dvpad : n += f'dv{self.F_dvpad}'
             if n != '' : n = 'p' + n
             return n
         pn = pad_name()
@@ -375,10 +370,18 @@ class FmhaBwdDQDKDVKernel:
 # TODO: design a more practical way to do it
 # this is current supported tile size.
 def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize]:
-    if (dtype == 'fp16' or dtype == 'bf16') and tr_load == 'f':
+    if dtype == 'fp32' and tr_load == 'f':
+        return [
+            #                     bm0, bn0, bk0, bk1, bk2, bk3, bk4, bhdq, bhdv,
+            FmhaBwdDQDKDVTileSize( 32, 128,  32,  32,  32,  32,  64,   32,   32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 16, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 16,  64,  64,  16,  64,  16,  16,   64,   64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 16, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 16,  64, 128,  16, 128,  16,  16,  128,  128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 16, 16, 16, 16, 1),
+        ]
+    elif (dtype == 'fp16' or dtype == 'bf16') and tr_load == 'f':
         return [
             FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
             FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 32, 128,  96, 32,  96, 32, 32,  96,  96, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
             FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
             # FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
             FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
@@ -386,6 +389,7 @@ def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize]
     elif (dtype == 'fp16' or dtype == 'bf16') and tr_load == 't':
         return [
                 FmhaBwdDQDKDVTileSize( 32, 128, 128, 32, 128, 32, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 32, 1),
+                FmhaBwdDQDKDVTileSize( 16, 192, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
                 # FmhaBwdDQDKDVTileSize( 16, 32, 128, 16, 128, 16, 32, 128, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 16, 1, 16),
                 FmhaBwdDQDKDVTileSize( 16,  16, 128, 16, 128, 16, 16, 128, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 16, 2, 16),
         ]
@@ -519,7 +523,8 @@ using convert_dq_trait_{F_idx} = fmha_bwd_convert_dq_traits_<{F_hdim},
                                                              {F_mode},
                                                              {F_spad},
                                                              {F_dpad},
-                                                             {F_deterministic}>;
+                                                             {F_deterministic},
+                                                             {F_bn0}>;
 
 #include <iostream>
 
@@ -618,8 +623,8 @@ class FmhaBwdApiTrait:
     dbias         : str
     dropout       : str
     spad1d        : str # spad for 1d kernels (dot/convert)
-    dpad          : str
-    dvpad         : str
+    dpad          : Literal[0, 1, 8]
+    dvpad         : Literal[0, 1, 8]
     deterministic : str
     mask_impl     : str
     tr_load       : str
@@ -648,13 +653,24 @@ class FmhaBwdApiTrait:
 
     @property
     def dcheck(self) -> str:
-        if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
-        else :               return f'a.hdim_q % {self.bhdq} == 0'
+        if self.dpad == 0: return f'a.hdim_q % {self.bhdq} == 0'
+        else:              return f'a.hdim_q % {self.dpad} == 0'
 
     @property
     def dvcheck(self) -> str:
-        if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
-        else :                return f'a.hdim_v % {self.bhdv} == 0'
+        if self.dvpad == 0: return f'a.hdim_v % {self.bhdv} == 0'
+        else:               return f'a.hdim_v % {self.dvpad} == 0'
+
+    @property
+    def extra_cond(self) -> str:
+        if self.tr_load == 't' and self.tile.max_seq_q == 0 and self.tile.F_bn0 == 128:
+            return "&& (a.seqlen_k <= 256)"
+        else:
+            return ""
+    
+    @property
+    def convert_dq_bn0(self) -> int:
+        return self.tile.F_bn0 if self.deterministic == 't' else 0
 
     @property
     def dot_do_o_kernel(self) -> FmhaBwdOGradDotOKernel:
@@ -663,8 +679,9 @@ class FmhaBwdApiTrait:
         def get_occupancy(dtype, hdim):
             return 2
 
+        F_dvpad = 't' if self.dvpad else 'f'
         return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d,
-            F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
+            F_dvpad=F_dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
 
     @property
     def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel:
@@ -679,8 +696,9 @@ class FmhaBwdApiTrait:
         def get_occupancy(dtype, hdim):
             return 2
 
+        F_dpad = 't' if self.dpad else 'f'
         return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype,
-            F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad,
+            F_bm0=M0_1D, F_bn0=self.convert_dq_bn0, F_spad=self.spad1d, F_dpad=F_dpad,
             F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim),
             F_deterministic=self.deterministic, disabled=self.tile.max_seq_q != 0)
 
@@ -706,9 +724,10 @@ class FmhaBwdApiPool:
                 F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
                 F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
                 F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype],
-                F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=trait.dpad, F_dvpad=trait.dvpad,
                 F_deterministic=BOOL_MAP[trait.deterministic], F_trload=BOOL_MAP[trait.tr_load], F_maxq=trait.tile.max_seq_q,
-                F_convert_dq_enabled=BOOL_MAP[not trait.convert_dq_kernel.disabled])
+                F_convert_dq_enabled=BOOL_MAP[not trait.convert_dq_kernel.disabled], F_bn0=trait.tile.F_bn0, F_cond_extra=trait.extra_cond,
+                F_convert_dq_bn0=trait.convert_dq_bn0)
             i += 1
         return inners
 
@@ -757,7 +776,7 @@ class FmhaBwdApiPool:
             per_tr_load += FMHA_BWD_API_COND_STATEMENT(F_cond=tr_load_cond_map[tr_load], F_body=per_max_seq_q, indent=4)
         if not per_tr_load:
             # empty string we add some ignore to suppress warning in api
-            per_tr_load += '    (void)t ; (void)s ; (void)a;'
+            per_tr_load += '    (void)t ; (void)s ; (void)a; (void)has_load_tr;'
         result = FMHA_BWD_KERNEL_HEADER + FMHA_BWD_API.format(F_dispatch = per_tr_load)
         return result.replace('\n\n', '\n')
 
@@ -778,7 +797,10 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
 
     for dtype, tr_load in itertools.product(BWD_DTYPE_MAP.keys(), ["t", "f"]):
         tiles: Any = get_dq_dk_dv_tiles(dtype, tr_load)
-        for tile, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(tiles, MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)):
+        dpad_options = itertools.product(*([[0, 8, 1]] * 2))
+        tf = ["t", "f"]
+        for tile, mode, mask, bias, dbias, dropout, spad1d, (dpad, dvpad), deterministic in itertools.product(
+                tiles, MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), tf, DROPOUT_MAP.keys(), tf, dpad_options, tf):
             assert isinstance(tile, FmhaBwdDQDKDVTileSize), "tile must be FmhaBwdDQDKDVTileSize"
             hdim = tile.F_bhdq
             if (mode == "group") and (spad1d == "f"):
@@ -789,8 +811,15 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
                 continue
             if ("wg32" in dropout):
                 continue
-            if tr_load == "t" and (dpad == "t" or dvpad == "t"):
+            if tr_load == "t":
                 continue  # tr_load cannot work with dpad or dvpad
+            else: # tr_load == "f"
+                # do not generate instance with only 1 of dpad/dvpad being 8
+                if dpad != dvpad and dpad == 8:
+                    continue
+            if optdim_list != [-1]:
+                if hdim not in optdim_list:
+                    continue
             t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl, tr_load=tr_load)
 
             if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o):
@@ -799,9 +828,6 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
                 continue
             if not fnmatch.fnmatch(t.convert_dq_kernel.name, filter_convert_dq):
                 continue
-            if optdim_list != [-1]:
-                if hdim not in optdim_list:
-                    continue
 
             # Flash attention integration
             if receipt == 2:
@@ -846,6 +872,30 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
                 cond = dtype in ['fp16', 'bf16']
                 if not cond:
                     continue
+
+            # fp32 only, all variations
+            if receipt == 800:
+                cond = dtype == 'fp32'
+                cond &= dpad == dvpad
+                if not cond:
+                    continue
+            # fp32 only, minimal set of parameters
+            elif receipt == 801:
+                cond = dtype == 'fp32'
+                cond &= hdim in [64, 128]
+                cond &= dpad == dvpad
+                cond &= mode == 'batch'
+                cond &= bias == 'no'
+                cond &= dropout == 'no'
+                cond &= mask == 's_no'
+                cond &= deterministic == "f"
+                if not cond:
+                    continue
+            else:
+                # Don't build fp32 by default
+                if dtype == 'fp32':
+                    continue
+
             gen_dot_do_o[t.dot_do_o_kernel] = True
             gen_dq_dk_dv[t.dq_dk_dv_kernel] = True
             if not t.convert_dq_kernel.disabled:
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index d9452206e7..f898d5f7b2 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -25,6 +25,7 @@ DTYPE_BITS = {
 
 K0_MAX_SUBMAX_MAP = {
     32 : 32,
+    48 : 48,
     64 : 64,
     96 : 128,
     128: 128,
@@ -163,8 +164,8 @@ float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config&
     [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{
         return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0);
     }};
-    
-    const bool has_load_tr = ck_tile::is_load_tr_supported();
+
+    [[maybe_unused]] const bool has_load_tr = ck_tile::is_load_tr_supported();
 
 {F_dispatch}
     return r;
@@ -248,22 +249,21 @@ class FmhaFwdApiTrait:
             if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.seqlen_q % {self.bm0} == 0'
         else: assert False
-    
-    @property
-    def seqtune(self) -> str:
-        if self.bm0 == 128: return 'true/*fall back to largest tile*/'                  # group mode only generate spad/skpad == true
-        else: 
+
+    def seqtune(self, max_bm0 : int) -> str:
+        if self.bm0 == max_bm0: return 'true/*fall back to largest tile*/'
+        else:
             return f'a.seqlen_q <= {self.bm0}'
 
     @property
     def skcheck(self) -> str:
         if self.mode == 'group': return 'true/*group mode skpad always true*/'                  # group mode only generate spad/skpad == true
         if self.pipeline_tag == 'qr_async':
-            if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
-            else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
+            if self.skpad == 't' : return f'(a.cu_seqlen_kv_ptr != nullptr) || (a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0)'
+            else :                 return f'(a.cu_seqlen_kv_ptr == nullptr) && (a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0)'
         elif self.pipeline_tag in ['qr', 'qs']:
             if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.seqlen_k % {self.bn0} == 0'
+            else :                 return f'(a.cu_seqlen_kv_ptr == nullptr) && (a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0)'
         elif self.pipeline_tag == 'qr_async_trload':
             if self.skpad == 't' : return 'true'
             else:                  return 'true'
@@ -351,7 +351,7 @@ class FmhaFwdPipeline:
 
         if self.F_squant == 't' : n += '_squant'
         else: n += '_nsquant'
-        
+
         if self.F_trload == 't' : n += '_trload'
         else: n += '_ntrload'
 
@@ -378,7 +378,7 @@ class FmhaFwdApiPool:
             "t": "has_load_tr",
             "f": "true"
         }
-        
+
         per_tr_load =str()
         for tr_load in ["t", "f"]:
             per_dtypes=str()
@@ -386,6 +386,7 @@ class FmhaFwdApiPool:
                 per_hdim_case=str()
                 for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()):
                     traits=[t for t in self.pool[dtype][(hdim, hdim_v)] if tr_load == t.tr_load]
+                    max_bm0 = max((t.bm0 for t in traits), default=0)
                     inners=str()
                     for k, trait in enumerate(traits):
                         if_k = 'if' if k == 0 else 'else if'
@@ -393,7 +394,7 @@ class FmhaFwdApiPool:
                                        F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
                                        F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
                                        F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_skip=BOOL_MAP[trait.skip], F_trload=BOOL_MAP[trait.tr_load],
-                                       F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_seqtune=trait.seqtune, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
+                                       F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_seqtune=trait.seqtune(max_bm0), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                        F_constraint=trait.constraint,
                                        F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                        F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
@@ -534,7 +535,20 @@ class KernelComponentFactory:
     # this is current supported tile size per hdim
     @staticmethod
     def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
-        if dtype == 'fp16' or dtype == 'bf16':
+        if dtype == 'fp32':
+            return {
+                #                             bm0, bn0, bk0, bn1, bk1,
+                ( 32,  32) : [FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                ( 48,  48) : [FmhaFwdTileSize( 32, 128,  16,  48,  16,   48,  2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                              FmhaFwdTileSize(128,  64,  16,  48,  32,   48,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                ( 64,  64) : [FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                ( 96, 128) : [FmhaFwdTileSize(128,  64,  32, 128,  32,   96,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                (128, 128) : [FmhaFwdTileSize( 32, 128,  32, 128,  16,  128,  2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                              FmhaFwdTileSize(128,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                (192, 192) : [FmhaFwdTileSize( 64,  64,  32, 192,  32,  192,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                (256, 256) : [FmhaFwdTileSize( 64,  64,  32, 256,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+            }
+        elif dtype == 'fp16' or dtype == 'bf16':
             return {
                 (32, 32)  : [FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                 (64, 64)  : [FmhaFwdTileSize(16, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
@@ -550,12 +564,16 @@ class KernelComponentFactory:
                 (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
                 (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
             }
-        elif dtype == 'fp8' or dtype == 'bf8':
+        elif dtype == 'fp8' or dtype == 'fp8bf16':
             return {
                 (64,64 )  : [FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
                 (128,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
                 (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
             }
+        elif dtype == 'fp8fp32':
+            return {
+                (128,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
+            }
         else:
             return None
 
@@ -567,9 +585,15 @@ class KernelComponentFactory:
         # TODO: the order of List matters! the later in this list will be also be checked later
         # TODO: currently for qr pipeline, let 't' padding to appear later!!
         # TODO: how to design this more generic?
-        squant = 't' if dtype == 'fp8' else 'f'
         pipelines = []
-        if dtype in ['fp16', 'bf16']:
+        if dtype in ['fp32']:
+            squant = 'f'
+            for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+        elif dtype in ['fp16', 'bf16']:
+            squant = 'f'
             for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
                 if hdim == 256 and hdim_v == 256:
                     pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
@@ -589,11 +613,12 @@ class KernelComponentFactory:
                             pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 't'))
                     if receipt == 1 and bias != "bias":
                         pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f')) # TODO: cover arbitraty hdim
-        elif dtype in ['fp8', 'bf8']:
+        elif dtype in ['fp8', 'fp8bf16', 'fp8fp32']:
             # no need lse/dropout kernels
-            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
-        elif dtype in ['fp8fp16', 'fp8bf16']:
+            for logits, squant, mask, bias in itertools.product(["f"], ["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
+        elif dtype in ['fp8fp16', 'bf8']:
             # TODO
             None
         else:
@@ -621,6 +646,8 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
             continue
         #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
         for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), MODE_MAP.keys()):
+            for tile, next_tile in zip(tiles, tiles[1:]):
+                assert next_tile.F_bm0 >= tile.F_bm0, 'Tiles must be ordered by increasing bm0'
             for tile, pipeline in itertools.product(tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl)):
                 if mode == "group":
                     if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
@@ -630,12 +657,13 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                     # NOTE: this is used to speedup deepseek prefill case, we don't gen training
                     if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                         continue
-                if pipeline.tag != 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 != 128) or ((hdim, hdim_v) != (128, 128) and tile.F_bm0 != 128)):
-                    # non qr_async_trload only support km0=128 tile size when hdim is not 128
-                    # non qr_async only support kn0=128 tile size when hdim is 128
-                    continue
-                if pipeline.tag == 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 == 128) or ((hdim, hdim_v) not in [(64, 64), (128, 128)])):
-                    continue
+                if dtype != 'fp32':
+                    if pipeline.tag != 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 != 128) or ((hdim, hdim_v) != (128, 128) and tile.F_bm0 != 128)):
+                        # non qr_async_trload only support km0=128 tile size when hdim is not 128
+                        # non qr_async only support kn0=128 tile size when hdim is 128
+                        continue
+                    if pipeline.tag == 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 == 128) or ((hdim, hdim_v) not in [(64, 64), (128, 128)])):
+                        continue
                 # logits_soft_cap is only allowed if no bias
                 if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
                     continue
@@ -674,27 +702,61 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                         continue
                 # Aiter(mha_fwd) integration
                 elif receipt == 100:
-                    cond = dtype in ['fp16', 'bf16']
+                    cond = dtype in ['fp16', 'bf16', 'fp8bf16']
                     cond &= mode == 'batch'
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_squant == 'f'
+                    if dtype == 'fp8bf16':
+                        cond &= hdim == 128
                     if not cond:
                         continue
                 # Aiter(mha_varlen_fwd) integration
                 elif receipt == 200:
-                    cond = dtype in ['fp16', 'bf16']
+                    cond = dtype in ['fp16', 'bf16', 'fp8bf16']
                     cond &= mode == 'group'
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_squant == 'f'
+                    if dtype == 'fp8bf16':
+                        cond &= hdim == 128
                     if not cond:
                         continue
                 # aiter::mha_fwd C++ api integration
                 elif receipt == 600:
-                    cond = dtype in ['fp16', 'bf16']
+                    cond = dtype in ['fp16', 'bf16', 'fp8bf16']
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_squant == 'f'
+                    if dtype == 'fp8bf16':
+                        cond &= hdim == 128
                     if not cond:
                         continue
+                elif receipt == 888:
+                    cond = dtype in ['fp8', 'fp8bf16', 'fp8fp32']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= hdim == 128
+                    if not cond:
+                        continue
+
+                # fp32 only, all variations
+                if receipt == 800:
+                    cond = dtype == 'fp32'
+                    cond &= pipeline.F_skip == 'f'
+                    cond &= pipeline.F_logits == 'f'
+                    if not cond:
+                        continue
+                # fp32 only, minimal set of parameters
+                elif receipt == 801:
+                    cond = dtype == 'fp32'
+                    cond &= hdim in [48, 128]
+                    cond &= mode == 'batch'
+                    cond &= pipeline.F_bias == 'no'
+                    cond &= pipeline.F_lse == 'f'
+                    cond &= pipeline.F_dropout == 'f'
+                    cond &= pipeline.F_skip == 'f'
+                    cond &= pipeline.F_logits == 'f'
+                    cond &= pipeline.F_mask == 's_no'
+                    if not cond:
+                        continue
+                else:
+                    # Don't build fp32 by default
+                    if dtype == 'fp32':
+                        continue
 
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index 0ebeaddf9c..38491b56c4 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -184,6 +184,9 @@ class FmhaFwdAppendKVApiPool:
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
             per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+        if not per_dtypes:
+            # empty string we add some ignore to suppress warning in api
+            per_dtypes += '    (void)t ; (void)s ; (void)a;'
         return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_APPENDKV_API.format(F_dispatch = per_dtypes)
 
 @dataclass
@@ -341,6 +344,13 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, op
                     cond &= pipeline.F_vlayout == 'row'
                     if not cond:
                         continue
+
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == 'fp32'
+                    if not cond:
+                        continue
+
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 1dd8f0e3c6..281357ef1e 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 
 import copy
@@ -347,8 +347,8 @@ class FmhaFwdSplitKVApiTrait:
             if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
             else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
         elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
-            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.seqlen_k % {self.bn0} == 0'
+            if self.skpad == 't' : return f'true /*a.seqlen_k_ptr != nullptr || a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_k_ptr == nullptr && a.seqlen_k % {self.bn0} == 0'
         else: assert False
 
     @property
@@ -645,7 +645,6 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
         return {
             '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
             '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
         }
     else:
         return None
@@ -769,6 +768,13 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, opt
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
+
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == 'fp32'
+                    if not cond:
+                        continue
+
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
@@ -835,6 +841,13 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt, optdim
                     cond = dtype in ['fp16', 'bf16']
                     if not cond:
                         continue
+
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == 'fp32'
+                    if not cond:
+                        continue
+
                 gen.append(k)
 
     return gen
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
index e468e82ed5..3624b7b387 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 
 import copy
@@ -189,8 +189,8 @@ class FmhaFwdApiTrait:
             if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
             else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
         elif self.pipeline_tag in ['qr_pagedkv', 'qs']:
-            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.seqlen_k % {self.bn0} == 0'
+            if self.skpad == 't' : return f'true /*a.seqlen_k_ptr != nullptr || a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_k_ptr == nullptr && a.seqlen_k % {self.bn0} == 0'
         else: assert False
 
     @property
@@ -465,14 +465,14 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
         squant = 't' if dtype == 'fp8' else 'f'
         pipelines = []
         if dtype in ['fp16', 'bf16']:
-            for logits, mask, bias,  pagedkv, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(),  ["t", "f"], ["t", "f"]):
-                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'col', 't', 'f', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
-                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'col', 't', 't', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
+            for logits, mask, bias,  pagedkv, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(),  ["t"], ["f"]):
                 pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 't', 'f', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
                 pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 't', 't', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
         elif dtype in ['fp8', 'bf8']:
-            # TODO
-            None
+            # no need lse/dropout kernels
+            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 'f', 'f', 'f', 'f', logits, bias, 'f', 't', squant, mask, 'f'))
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 't', 't', 'f', 'f', logits, bias, 'f', 't', squant, mask, 'f'))
         elif dtype in ['fp8fp16', 'fp8bf16']:
             # TODO
             None
@@ -560,6 +560,12 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                     if not cond:
                         continue
 
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == 'fp32'
+                    if not cond:
+                        continue
+
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/example_fmha_bwd.cpp b/example/ck_tile/01_fmha/example_fmha_bwd.cpp
new file mode 100644
index 0000000000..73b3c1e619
--- /dev/null
+++ b/example/ck_tile/01_fmha/example_fmha_bwd.cpp
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "fmha_bwd.hpp"
+#include "fmha_bwd_runner.hpp"
+
+#include <string>
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "whether do CPU validation or not")
+        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
+        .insert("b", "2", "batch size")
+        .insert("h", "8", "num of head, for q")
+        .insert("h_k",
+                "-1",
+                "num of head, for k/v, -1 means equal to h\n"
+                "if not equal to h, then this is GQA/MQA case")
+        .insert("s",
+                "3328",
+                "seqlen_q. if group-mode, means the average value of seqlen_q\n"
+                "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary\n"
+                "also with \"-s=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_k",
+                "-1",
+                "seqlen_k, -1 means equal to s\n"
+                "also with \"-s_k=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("d", "128", "head dim for q, k")
+        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
+        .insert("scale", "0", "scale factor. 0 means equal to 1/sqrt(hdim)")
+        .insert("iperm",
+                "1",
+                "permute input\n"
+                "if true, will be b*h*s*d, else b*s*h*d")
+        .insert("operm", "1", "permute output")
+        .insert("bias",
+                "n",
+                "n or 0, no bias\n"
+                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
+                "a(libi) or 2, alibi with 1*h. a:1, b*h")
+        .insert("dbias", "0", "output bias gradient or not")
+        .insert("prec", "fp16", "data type. fp32/fp16/bf16")
+        .insert("mask",
+                "0",
+                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
+                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
+                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
+                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
+                "'xt:window_size', xformer style masking from top-left, window_size negative is "
+                "causal, positive is swa\n"
+                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
+                "causal, positive is swa\n"
+                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
+                "now)")
+        .insert("kname", "0", "if set to 1 will print kernel name")
+        .insert("init",
+                "uf",
+                "init method:\n  ui or 0 - uniform random int\n  uf or 1 - uniform random float"
+                "\n  tf or 2 - trig float")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("p_drop", "0", "0~1 probability of dropout")
+        .insert("drop_seed", "1", "seed for dropout random number generator")
+        .insert("drop_offset", "0", "offset for dropout random number generator")
+        .insert(
+            "drop_prefs",
+            "0",
+            "whether dropout seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("deterministic",
+                "0",
+                "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation "
+                "will not be used")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "fmha_bwd.json", "json file name to dump results");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataTypeConfig>
+auto run(const ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type    = arg_parser.get_str("prec");
+    int do_validation        = arg_parser.get_int("v");
+    mode_enum mode           = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
+    ck_tile::index_t batch   = arg_parser.get_int("b");
+    ck_tile::index_t nhead   = arg_parser.get_int("h");
+    ck_tile::index_t nhead_k = arg_parser.get_int("h_k");
+    auto seqlen_qs           = arg_parser.get_int_vec("s");
+    auto seqlen_ks           = arg_parser.get_int_vec("s_k");
+    ck_tile::index_t hdim_q  = arg_parser.get_int("d");
+    ck_tile::index_t hdim_v  = arg_parser.get_int("d_v");
+    bool i_perm              = arg_parser.get_bool("iperm");
+    bool o_perm              = arg_parser.get_bool("operm");
+    float scale              = arg_parser.get_float("scale");
+    std::string bias_str     = arg_parser.get_str("bias");
+    bool use_dbias           = arg_parser.get_bool("dbias");
+    float p_drop             = arg_parser.get_float("p_drop");
+    uint64_t drop_seed       = arg_parser.get_uint64("drop_seed");
+    uint64_t drop_offset     = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs          = arg_parser.get_bool("drop_prefs");
+    std::string mask_str     = arg_parser.get_str("mask");
+    bool deterministic       = arg_parser.get_bool("deterministic");
+    std::string init_method  = arg_parser.get_str("init");
+    uint32_t seed            = arg_parser.get_uint32("seed");
+
+    ck_tile::stream_config stream_config{nullptr,
+                                         true,
+                                         /* log_level = */ (arg_parser.get_bool("kname") ? 1 : 0),
+                                         arg_parser.get_int("warmup"),
+                                         arg_parser.get_int("repeat"),
+                                         arg_parser.get_str("timer") == std::string("gpu")};
+
+    auto json = arg_parser.get_int("json") == 1
+                    ? std::optional<std::string>{arg_parser.get_str("jsonfile")}
+                    : std::nullopt;
+
+    return fmha_bwd_run<DataTypeConfig>(mode,
+                                        batch,
+                                        nhead,
+                                        nhead_k,
+                                        seqlen_qs,
+                                        seqlen_ks,
+                                        hdim_q,
+                                        hdim_v,
+                                        i_perm,
+                                        o_perm,
+                                        scale,
+                                        bias_str,
+                                        use_dbias,
+                                        p_drop,
+                                        drop_seed,
+                                        drop_offset,
+                                        drop_prefs,
+                                        mask_str,
+                                        deterministic,
+                                        init_method,
+                                        seed,
+                                        do_validation,
+                                        stream_config,
+                                        json);
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, arg_parser] = create_args(argc, argv);
+        if(!result)
+            return -1;
+
+        const std::string data_type = arg_parser.get_str("prec");
+        if(data_type == "fp32")
+        {
+            return run<FmhaBwdFp32>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp16")
+        {
+            return run<FmhaBwdFp16>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "bf16")
+        {
+            return run<FmhaBwdBf16>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        std::cerr << "Unsupported precision: " << data_type << std::endl;
+        return -1;
+    }
+    catch(const std::invalid_argument& e)
+    {
+        std::cerr << "Invalid argument: " << e.what() << std::endl;
+        return -1;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -2;
+    }
+}
diff --git a/example/ck_tile/01_fmha/example_fmha_fwd.cpp b/example/ck_tile/01_fmha/example_fmha_fwd.cpp
new file mode 100644
index 0000000000..c27a5ce1ae
--- /dev/null
+++ b/example/ck_tile/01_fmha/example_fmha_fwd.cpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "fmha_fwd.hpp"
+#include "fmha_fwd_runner.hpp"
+
+#include <string>
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "0:no validation, 2:cpu validation, 2:gpu validation(experimental)")
+        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
+        .insert("b", "2", "batch size")
+        .insert("h", "8", "num of head, for q")
+        .insert("h_k",
+                "-1",
+                "num of head, for k/v, -1 means equal to h\n"
+                "if not equal to h, then this is GQA/MQA case")
+        .insert("s",
+                "3328",
+                "seqlen_q. if group-mode, means the average value of seqlen_q\n"
+                "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary\n"
+                "also with \"-s=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_k",
+                "-1",
+                "seqlen_k (including new key/value), -1 means equal to s\n"
+                "also with \"-s_k=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_knew",
+                "0",
+                "seqlen_k for new key/value, 0 means not to use this at all; "
+                "-1 to choose s_knew in [1, s] randomly.")
+        .insert("s_qpad",
+                "-1",
+                "seqlen_q stride between 2 batches (group-mode optional).\n"
+                "Provide positive strides per-batch to simulate physical padding on Q.")
+        .insert("s_kpad",
+                "-1",
+                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
+                "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n"
+                "along seqlen, instead of packed, same as xformer kv_padding,\n"
+                "must be greater than or equal to s_k")
+        .insert("d", "128", "head dim for q, k")
+        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
+        .insert("scale_s",
+                "0",
+                "scale factor of S. 0 means equal to 1/sqrt(hdim).\n"
+                "note when squant=1, this value will be modified")
+        .insert("logits_soft_cap", "0", "attention logits soft capping value.")
+        .insert("squant",
+                "auto",
+                "if using static quantization fusion or not. auto: fp8 will default use squant, "
+                "other will not\n"
+                "0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to "
+                "P and O.\n"
+                "calculate scale_s, scale_p, scale_o auto")
+        .insert("iperm",
+                "1",
+                "permute input\n"
+                "if true, will be b*h*s*d, else b*s*h*d")
+        .insert("operm", "1", "permute output")
+        .insert("bias",
+                "n",
+                "n or 0, no bias\n"
+                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
+                "a(libi) or 2, alibi with 1*h. a:1, b*h")
+        .insert("prec", "fp16", "data type. fp32/fp16/bf16/fp8/bf8")
+        .insert("mask",
+                "0",
+                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
+                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
+                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
+                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
+                "'xt:window_size', xformer style masking from top-left, window_size negative is "
+                "causal, positive is swa\n"
+                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
+                "causal, positive is swa\n"
+                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
+                "now)")
+        .insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
+        .insert("lse", "0", "0 not store lse, 1 store lse")
+        .insert("kname", "0", "if set to 1 will print kernel name")
+        .insert("init",
+                "uf",
+                "init method:\n  ui or 0 - uniform random int\n  ni - normalized random int"
+                "\n  uf or 1 - uniform random float\n  nf - normalized random float"
+                "\n  tf or 2 - trig float\n")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("p_drop", "0", "0~1 probability of dropout")
+        .insert("drop_seed", "1", "seed for dropout random number generator")
+        .insert("drop_offset", "0", "offset for dropout random number generator")
+        .insert(
+            "drop_prefs",
+            "0",
+            "whether dropout seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert(
+            "rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
+        .insert("rotary_interleaved", "1", "whether to apply interleaved RoPE")
+        .insert("num_splits",
+                "1",
+                "# of splits for key/value. 0 to determine actual number by heuristic")
+        .insert("page_block_size", "0", "paged-kvcache block size. 0 means not use paged-kvcahe")
+        .insert("cache_batch_idx", "0", "whether to use index map to the kvcache")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "fmha_fwd.json", "json file name to dump results")
+        .insert("q_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.")
+        .insert("kv_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataTypeConfig>
+auto run(const ck_tile::ArgParser& arg_parser)
+{
+    int do_validation                = arg_parser.get_int("v");
+    mode_enum mode                   = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
+    ck_tile::index_t batch           = arg_parser.get_int("b");
+    ck_tile::index_t nhead           = arg_parser.get_int("h");
+    ck_tile::index_t nhead_k         = arg_parser.get_int("h_k");
+    auto seqlen_qs                   = arg_parser.get_int_vec("s");
+    auto seqlen_ks                   = arg_parser.get_int_vec("s_k");
+    ck_tile::index_t hdim_q          = arg_parser.get_int("d");
+    ck_tile::index_t hdim_v          = arg_parser.get_int("d_v");
+    ck_tile::index_t seqlen_knew     = arg_parser.get_int("s_knew");
+    auto seqlen_kpads                = arg_parser.get_int_vec("s_kpad");
+    auto seqlen_qpads                = arg_parser.get_int_vec("s_qpad");
+    auto q_eff_lens_per_batch        = arg_parser.get_int_vec("q_eff_lens");
+    auto kv_eff_lens_per_batch       = arg_parser.get_int_vec("kv_eff_lens");
+    ck_tile::index_t rotary_dim      = arg_parser.get_int("rotary_dim");
+    bool i_perm                      = arg_parser.get_bool("iperm");
+    bool o_perm                      = arg_parser.get_bool("operm");
+    float scale_s                    = arg_parser.get_float("scale_s");
+    float logits_soft_cap            = arg_parser.get_float("logits_soft_cap");
+    bool is_v_rowmajor               = arg_parser.get_str("vlayout") == "r";
+    bool lse                         = arg_parser.get_bool("lse");
+    ck_tile::index_t page_block_size = arg_parser.get_int("page_block_size");
+    bool use_cache_batch_idx         = arg_parser.get_bool("cache_batch_idx");
+    std::string bias_str             = arg_parser.get_str("bias");
+    float p_drop                     = arg_parser.get_float("p_drop");
+    uint64_t drop_seed               = arg_parser.get_uint64("drop_seed");
+    uint64_t drop_offset             = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs                  = arg_parser.get_bool("drop_prefs");
+    std::string mask_str             = arg_parser.get_str("mask");
+    bool is_rotary_interleaved       = arg_parser.get_bool("rotary_interleaved");
+    ck_tile::index_t num_splits      = arg_parser.get_int("num_splits");
+    std::string init_method          = arg_parser.get_str("init");
+    uint32_t seed                    = arg_parser.get_uint32("seed");
+
+    bool squant = [&]() {
+        if(arg_parser.get_str("squant") == "auto")
+            return std::is_same_v<DataTypeConfig, FmhaFwdFp8>;
+        else
+            return arg_parser.get_bool("squant");
+    }();
+
+    ck_tile::stream_config stream_config{nullptr,
+                                         true,
+                                         /* log_level = */ (arg_parser.get_bool("kname") ? 1 : 0),
+                                         arg_parser.get_int("warmup"),
+                                         arg_parser.get_int("repeat"),
+                                         arg_parser.get_str("timer") == std::string("gpu")};
+
+    auto json = arg_parser.get_int("json") == 1
+                    ? std::optional<std::string>{arg_parser.get_str("jsonfile")}
+                    : std::nullopt;
+
+    return fmha_fwd_run<DataTypeConfig>(mode,
+                                        batch,
+                                        nhead,
+                                        nhead_k,
+                                        seqlen_qs,
+                                        seqlen_ks,
+                                        hdim_q,
+                                        hdim_v,
+                                        seqlen_knew,
+                                        seqlen_qpads,
+                                        seqlen_kpads,
+                                        q_eff_lens_per_batch,
+                                        kv_eff_lens_per_batch,
+                                        rotary_dim,
+                                        i_perm,
+                                        o_perm,
+                                        scale_s,
+                                        logits_soft_cap,
+                                        is_v_rowmajor,
+                                        lse,
+                                        page_block_size,
+                                        use_cache_batch_idx,
+                                        bias_str,
+                                        p_drop,
+                                        drop_seed,
+                                        drop_offset,
+                                        drop_prefs,
+                                        mask_str,
+                                        squant,
+                                        is_rotary_interleaved,
+                                        num_splits,
+                                        init_method,
+                                        seed,
+                                        do_validation,
+                                        stream_config,
+                                        json);
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, arg_parser] = create_args(argc, argv);
+        if(!result)
+            return -1;
+
+        const std::string data_type = arg_parser.get_str("prec");
+        if(data_type == "fp32")
+        {
+            return run<FmhaFwdFp32>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp16")
+        {
+            return run<FmhaFwdFp16>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "bf16")
+        {
+            return run<FmhaFwdBf16>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp8")
+        {
+            return run<FmhaFwdFp8>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp8bf16")
+        {
+            return run<FmhaFwdFp8Bf16>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp8fp32")
+        {
+            return run<FmhaFwdFp8Fp32>(arg_parser) == fwd_result::success ? 0 : -2;
+        }
+        std::cerr << "Unsupported precision: " << data_type << std::endl;
+        return -1;
+    }
+    catch(const std::invalid_argument& e)
+    {
+        std::cerr << "Invalid argument: " << e.what() << std::endl;
+        return -1;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -2;
+    }
+}
diff --git a/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp b/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp
index d2428e5152..7ddb65a2db 100644
--- a/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp
+++ b/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp
@@ -45,25 +45,23 @@ auto parse_cmd_args(int argc, char* argv[]) -> std::pair<bool, ck_tile::ArgParse
                 "permute input\n"
                 "if true, will be b*h*s*d, else b*s*h*d")
         .insert("operm", "0", "permute output")
-        .insert("mask",
-                "0",
-                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
-                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
-                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
-                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
-                "'xt:window_size', xformer style masking from top-left, window_size negative is "
-                "causal, positive is swa\n"
-                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
-                "causal, positive is swa\n"
-                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
-                "now)")
+        .insert("causal", "0", "0: no mask, 1: causal mask")
         .insert("v", "1", "0:no verify, 1:verify")
         .insert("seed",
                 "11939",
                 "random seed used for initializing input tensors. 0 for "
                 "non-deterministic seed")
         .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "30", "number of iterations to benchmark the kernel");
+        .insert("repeat", "30", "number of iterations to benchmark the kernel")
+        // Optional effective seqlen override (exclude PAD) for batch mode
+        .insert("q_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.")
+        .insert("kv_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_pair(result, arg_parser);
@@ -109,10 +107,21 @@ struct Problem
         softmax_scale = args.get_float("scale_s");
         if(softmax_scale == .0f)
             softmax_scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim));
-        mask = mask_info::decode(args.get_str("mask"), seqlen_q, seqlen_k);
+
+        const auto is_causal = args.get_bool("causal");
+        if(is_causal)
+        {
+            mask = mask_info::decode("b:-1,0", seqlen_q, seqlen_k);
+        }
+        else
+        {
+            mask = mask_info::decode("0", seqlen_q, seqlen_k);
+        }
 
         input_layout  = args.get_int("iperm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
         output_layout = args.get_int("operm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
+        q_eff_lens    = args.get_int_vec("q_eff_lens");
+        kv_eff_lens   = args.get_int_vec("kv_eff_lens");
     }
 
     std::vector<ck_tile::index_t> get_query_shape() const
@@ -174,6 +183,8 @@ struct Problem
     mask_info mask;
     TensorLayout input_layout;
     TensorLayout output_layout;
+    std::vector<int> q_eff_lens;
+    std::vector<int> kv_eff_lens;
 };
 
 struct RunConfig
@@ -328,8 +339,10 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
     q_buf.ToDevice(q.data());
     k_buf.ToDevice(k.data());
     v_buf.ToDevice(v.data());
+    // Ensure output buffer is zero-initialized so padded regions compare cleanly
+    o_buf.SetZero();
 
-    ck_tile::fmha_fwd_v3_args args;
+    ck_tile::fmha_fwd_v3_args args{};
 
     args.data_type     = problem.data_type;
     args.batch         = problem.batch;
@@ -382,6 +395,60 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
                               : problem.seqlen_q * problem.hdim;
     args.batch_stride_o = problem.seqlen_q * problem.nhead_q * problem.hdim;
 
+    // Optional cumulative seqlen overrides (exclude PAD)
+    const bool has_varlen_q = !problem.q_eff_lens.empty() && problem.q_eff_lens[0] != -1;
+    const bool has_varlen_k = !problem.kv_eff_lens.empty() && problem.kv_eff_lens[0] != -1;
+
+    auto make_effective_vec = [&](const std::vector<int>& opt_vec, ck_tile::index_t fallback) {
+        std::vector<ck_tile::index_t> eff;
+        if(!opt_vec.empty() && opt_vec[0] != -1)
+        {
+            eff.assign(opt_vec.begin(), opt_vec.end());
+            if(eff.size() < static_cast<size_t>(problem.batch))
+            {
+                eff.resize(problem.batch, eff.back());
+            }
+        }
+        else
+        {
+            eff.assign(problem.batch, fallback);
+        }
+        return eff;
+    };
+
+    const auto eff_q_vec  = make_effective_vec(problem.q_eff_lens, problem.seqlen_q);
+    const auto eff_kv_vec = make_effective_vec(problem.kv_eff_lens, problem.seqlen_k);
+
+    // Calculate cumulative sums for kernel arguments if varlen is used
+    std::vector<ck_tile::index_t> cuq_cum, cukv_cum;
+    auto calculate_cumulative = [&](const std::vector<ck_tile::index_t>& per_batch_vec,
+                                    std::vector<ck_tile::index_t>& cum_vec) {
+        cum_vec.resize(per_batch_vec.size() + 1);
+        cum_vec[0] = 0;
+        for(std::size_t i = 0; i < per_batch_vec.size(); ++i)
+            cum_vec[i + 1] = cum_vec[i] + per_batch_vec[i];
+    };
+
+    if(has_varlen_q)
+    {
+        calculate_cumulative(eff_q_vec, cuq_cum);
+    }
+    if(has_varlen_k)
+    {
+        calculate_cumulative(eff_kv_vec, cukv_cum);
+    }
+
+    ck_tile::DeviceMem cuq_buf(!cuq_cum.empty() ? cuq_cum.size() * sizeof(ck_tile::index_t) : 0);
+    ck_tile::DeviceMem cukv_buf(!cukv_cum.empty() ? cukv_cum.size() * sizeof(ck_tile::index_t) : 0);
+    cuq_buf.ToDevice(!cuq_cum.empty() ? cuq_cum.data() : nullptr);
+    cukv_buf.ToDevice(!cukv_cum.empty() ? cukv_cum.data() : nullptr);
+    args.cu_seqlen_q_ptr =
+        !cuq_cum.empty() ? reinterpret_cast<const ck_tile::index_t*>(cuq_buf.GetDeviceBuffer())
+                         : nullptr;
+    args.cu_seqlen_kv_ptr =
+        !cukv_cum.empty() ? reinterpret_cast<const ck_tile::index_t*>(cukv_buf.GetDeviceBuffer())
+                          : nullptr;
+
     ck_tile::stream_config stream_config{nullptr,
                                          true,
                                          /*log_level=*/0,
@@ -444,15 +511,72 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
         o_ref = o_ref.transpose({0, 2, 1, 3});
     }
 
-    host::fmha_fwd<float, DataType>(q,
-                                    k,
-                                    v,
-                                    problem.mask,
-                                    o_ref,
-                                    ck_tile::identity{},
-                                    ck_tile::identity{},
-                                    ck_tile::identity{},
-                                    ck_tile::scales{problem.softmax_scale});
+    // If variable lengths are provided, compute per-batch references
+    // with the effective lengths; else compute a single full reference.
+    if(has_varlen_q || has_varlen_k)
+    {
+        // Variable-length aware verification: zero-fill padded region and only compute valid part.
+        o_ref.SetZero();
+
+        for(int b = 0; b < problem.batch; ++b)
+        {
+            const ck_tile::index_t seqlen_q_eff  = eff_q_vec[b];
+            const ck_tile::index_t seqlen_kv_eff = eff_kv_vec[b];
+
+            if(seqlen_q_eff <= 0 || seqlen_kv_eff <= 0)
+                continue;
+
+            // Slice current batch from inputs (bshd) and build single-batch tensors
+            ck_tile::HostTensor<DataType> q_b({1, seqlen_q_eff, problem.nhead_q, problem.hdim});
+            ck_tile::HostTensor<DataType> k_b({1, seqlen_kv_eff, problem.nhead_kv, problem.hdim});
+            ck_tile::HostTensor<DataType> v_b({1, seqlen_kv_eff, problem.nhead_kv, problem.hdim});
+            ck_tile::HostTensor<DataType> o_b({1, seqlen_q_eff, problem.nhead_q, problem.hdim});
+
+            // Copy effective region
+            q_b.ForEach([&](auto& self, auto idx) {
+                // idx: [0, s, h, d]
+                self(idx) = q(b, idx[1], idx[2], idx[3]);
+            });
+            k_b.ForEach([&](auto& self, auto idx) { self(idx) = k(b, idx[1], idx[2], idx[3]); });
+            v_b.ForEach([&](auto& self, auto idx) { self(idx) = v(b, idx[1], idx[2], idx[3]); });
+
+            // Compute reference for this batch segment (host::fmha_fwd expects bshd tensors)
+            host::fmha_fwd<float, DataType>(q_b,
+                                            k_b,
+                                            v_b,
+                                            problem.mask,
+                                            o_b,
+                                            ck_tile::identity{},
+                                            ck_tile::identity{},
+                                            ck_tile::identity{},
+                                            ck_tile::scales{problem.softmax_scale});
+
+            // Scatter into o_ref's bshd descriptor memory
+            for(int s = 0; s < seqlen_q_eff; ++s)
+            {
+                for(int h = 0; h < problem.nhead_q; ++h)
+                {
+                    for(int d = 0; d < problem.hdim; ++d)
+                    {
+                        o_ref(b, s, h, d) = o_b(0, s, h, d);
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        // No varlen override: compute the full reference once
+        host::fmha_fwd<float, DataType>(q,
+                                        k,
+                                        v,
+                                        problem.mask,
+                                        o_ref,
+                                        ck_tile::identity{},
+                                        ck_tile::identity{},
+                                        ck_tile::identity{},
+                                        ck_tile::scales{problem.softmax_scale});
+    }
 
     ck_tile::HostTensor<DataType> o(problem.get_output_shape());
     o_buf.FromDevice(o.data());
diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp
deleted file mode 100644
index 9f1e0f6948..0000000000
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ /dev/null
@@ -1,998 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "fmha_bwd.hpp"
-#include "ck_tile/host.hpp"
-#include "mask.hpp"
-#include "utils.hpp"
-
-#include <array>
-#include <cstring>
-#include <functional>
-#include <numeric>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
-{
-    using size_type = typename std::vector<T>::size_type;
-
-    os << "[";
-    for(size_type idx = 0; idx < v.size(); ++idx)
-    {
-        if(0 < idx)
-        {
-            os << ", ";
-        }
-        os << v[idx];
-    }
-    return os << "]";
-}
-
-auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "weather do CPU validation or not")
-        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
-        .insert("b", "2", "batch size")
-        .insert("h", "8", "num of head, for q")
-        .insert("h_k",
-                "-1",
-                "num of head, for k/v, -1 means equal to h\n"
-                "if not equal to h, then this is GQA/MQA case")
-        .insert("s",
-                "3328",
-                "seqlen_q. if group-mode, means the average value of seqlen_q\n"
-                "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary")
-        .insert("s_k", "-1", "seqlen_k, -1 means equal to s")
-        .insert("d", "128", "head dim for q, k")
-        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
-        .insert("scale", "0", "scale factor. 0 means equal to 1/sqrt(hdim)")
-        .insert("iperm",
-                "1",
-                "permute input\n"
-                "if true, will be b*h*s*d, else b*s*h*d")
-        .insert("operm", "1", "permute output")
-        .insert("bias",
-                "n",
-                "n or 0, no bias\n"
-                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
-                "a(libi) or 2, alibi with 1*h. a:1, b*h")
-        .insert("dbias", "0", "output bias gradient or not")
-        .insert("prec", "fp16", "data type. fp16 or bf16")
-        .insert("mask",
-                "0",
-                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
-                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
-                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
-                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
-                "'xt:window_size', xformer style masking from top-left, window_size negative is "
-                "causal, positive is swa\n"
-                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
-                "causal, positive is swa\n"
-                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
-                "now)")
-        .insert("kname", "0", "if set to 1 will print kernel name")
-        .insert("init", "1", "init method. 0:random int, 1:random float, 2:trig float")
-        .insert("seed",
-                "11939",
-                "random seed used for initializing input tensors. 0 for "
-                "non-deterministic seed")
-        .insert("p_drop", "0", "0~1 probability of dropout")
-        .insert("drop_seed", "1", "seed for random number generator")
-        .insert("drop_offset", "0", "offset for random number generator")
-        .insert("drop_prefs",
-                "0",
-                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel")
-        .insert("deterministic",
-                "0",
-                "if set to 1 will use multi-buffer reduction strategy for dq, atomic opeartion "
-                "will not be used");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
-
-// different threshold for different dtype
-template <typename DataTypeConfig>
-auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
-{
-    double rtol = 1e-2;
-    double atol = 1e-2;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<FmhaBwdBf16>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
-{
-    double rtol = 1e-2;
-    double atol = 1e-2;
-    if(hdim_q > 128 && hdim_v > 128) // 3.2 for RTZ/1.5 for RTN
-    {
-        rtol = 3.2e-2;
-        atol = 3.2e-2;
-    }
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <typename DataTypeConfig>
-bool run(const ck_tile::ArgParser& arg_parser)
-{
-    std::string data_type    = arg_parser.get_str("prec");
-    int do_validation        = arg_parser.get_int("v");
-    auto mode                = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
-    ck_tile::index_t batch   = arg_parser.get_int("b");
-    ck_tile::index_t nhead   = arg_parser.get_int("h");
-    ck_tile::index_t nhead_k = arg_parser.get_int("h_k");
-    if(nhead_k < 0)
-        nhead_k = nhead;
-
-    if(nhead % nhead_k != 0)
-    {
-        std::cerr << "nhead:" << nhead << " must be multiple of nhead_k:" << nhead_k << std::endl;
-        return false;
-    }
-
-    ck_tile::index_t seqlen_q = arg_parser.get_int("s");
-    ck_tile::index_t seqlen_k = arg_parser.get_int("s_k");
-    if(seqlen_k < 0)
-        seqlen_k = seqlen_q;
-    ck_tile::index_t hdim_q = arg_parser.get_int("d");
-    ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
-    if(hdim_v < 0)
-        hdim_v = hdim_q;
-
-    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
-    bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim
-
-    float scale = arg_parser.get_float("scale");
-    if(scale == .0f)
-        scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q));
-
-    bias_info bias       = bias_info::decode(arg_parser.get_str("bias"));
-    bool use_dbias       = arg_parser.get_bool("dbias");
-    float p_drop         = arg_parser.get_float("p_drop");
-    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
-    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
-    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
-
-    if(use_dbias && bias.type != bias_enum::elementwise_bias)
-    {
-        std::cerr << "dbias only exists when bias type is elementwise" << std::endl;
-        return false;
-    }
-
-    if(p_drop < 0.0f || p_drop > 1.0f)
-    {
-        std::cerr << "The value of p_drop should be 0~1" << std::endl;
-        return false;
-    }
-    float p_undrop = 1.0 - p_drop;
-    uint8_t p_undrop_in_uint8_t =
-        uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
-    float rp_undrop = 1.0 / p_undrop;
-
-    bool s_randval = false;
-    if(p_drop > 0.0f && do_validation)
-    {
-        s_randval = true;
-    }
-
-    mask_info mask = mask_info::decode(arg_parser.get_str("mask"), seqlen_q, seqlen_k);
-
-    int init_method              = arg_parser.get_int("init");
-    std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
-    if(*seed == 0)
-    {
-        seed.reset();
-    }
-
-    int stream_warmup  = arg_parser.get_int("warmup");
-    int stream_repeat  = arg_parser.get_int("repeat");
-    bool kname         = arg_parser.get_bool("kname");
-    bool deterministic = arg_parser.get_bool("deterministic");
-
-    ck_tile::stream_config stream_config{nullptr,
-                                         true,
-                                         /* log_level = */ (kname ? 1 : 0),
-                                         stream_warmup,
-                                         stream_repeat,
-                                         arg_parser.get_str("timer") == std::string("gpu")};
-
-    const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q);
-    const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k);
-
-    using TypeConfig = FmhaBwdTypeConfig<DataTypeConfig>;
-
-    using QDataType             = typename TypeConfig::QDataType;
-    using KDataType             = typename TypeConfig::KDataType;
-    using VDataType             = typename TypeConfig::VDataType;
-    using GemmDataType          = typename TypeConfig::GemmDataType;
-    using BiasDataType          = typename TypeConfig::BiasDataType;
-    using LSEDataType           = typename TypeConfig::LSEDataType;
-    using AccDataType           = typename TypeConfig::AccDataType;
-    using DDataType             = typename TypeConfig::DDataType;
-    using RandValOutputDataType = typename TypeConfig::RandValOutputDataType;
-    using ODataType             = typename TypeConfig::ODataType;
-    using OGradDataType         = typename TypeConfig::OGradDataType;
-    using QGradDataType         = typename TypeConfig::QGradDataType;
-    using KGradDataType         = typename TypeConfig::KGradDataType;
-    using VGradDataType         = typename TypeConfig::VGradDataType;
-    using BiasGradDataType      = typename TypeConfig::BiasGradDataType;
-
-    // accumulation numbers for performance evaluation
-    std::size_t flop = 0, num_byte = 0;
-    auto max_seqlen_q =
-        std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
-    auto max_seqlen_k =
-        std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
-    {
-        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
-        {
-            const int32_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
-            const int32_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
-
-            if(max_seqlen_q < real_seqlen_q)
-            {
-                max_seqlen_q = real_seqlen_q;
-            }
-
-            if(max_seqlen_k < real_seqlen_k)
-            {
-                max_seqlen_k = real_seqlen_k;
-            }
-
-            flop += nhead * (static_cast<std::size_t>(3) * static_cast<std::size_t>(2) *
-                                 real_seqlen_q * real_seqlen_k * hdim_q + // Q@K/dS^T@Q^T/dS@K^T
-                             static_cast<std::size_t>(2) * static_cast<std::size_t>(2) *
-                                 real_seqlen_q * real_seqlen_k * hdim_v); // dO@V/P^T@dO^T
-
-            num_byte += nhead * (sizeof(QDataType) * real_seqlen_q * hdim_q +
-                                 sizeof(KDataType) * real_seqlen_k * hdim_q +
-                                 sizeof(VDataType) * real_seqlen_k * hdim_v +
-                                 sizeof(ODataType) * real_seqlen_q * hdim_v +
-                                 sizeof(OGradDataType) * real_seqlen_q * hdim_v +
-                                 sizeof(QGradDataType) * real_seqlen_q * hdim_q +
-                                 sizeof(KGradDataType) * real_seqlen_k * hdim_q +
-                                 sizeof(VGradDataType) * real_seqlen_k * hdim_v +
-                                 sizeof(LSEDataType) * real_seqlen_q);
-        }
-    }
-
-    auto get_lengths = [&](bool permute,
-                           ck_tile::index_t b /*batch*/,
-                           ck_tile::index_t h /*nhead*/,
-                           ck_tile::index_t s /*seqlen*/,
-                           ck_tile::index_t d /*hdim*/) {
-        if(permute)
-            return std::array<ck_tile::index_t, 4>{b, h, s, d};
-        else
-            return std::array<ck_tile::index_t, 4>{b, s, h, d};
-    };
-
-    // host memory for storing all the tensor elements
-    const ck_tile::index_t shape_batch = (mode == mode_enum::batch ? batch : 1);
-    const ck_tile::index_t shape_seqlen_q =
-        (mode == mode_enum::batch ? seqlen_q : seqstart_q_host.back());
-    const ck_tile::index_t shape_seqlen_k =
-        (mode == mode_enum::batch ? seqlen_k : seqstart_k_host.back());
-    const ck_tile::index_t kN0 = (hdim_q <= 128) ? 128 : 64;
-    const ck_tile::index_t nsplits =
-        deterministic ? ck_tile::integer_divide_ceil(max_seqlen_k, kN0) : 1;
-
-    ck_tile::HostTensor<QDataType> q_host(
-        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
-    ck_tile::HostTensor<KDataType> k_host(
-        get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_q));
-    ck_tile::HostTensor<VDataType> v_host(
-        get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_v));
-    ck_tile::HostTensor<BiasDataType> bias_host(
-        bias.type == bias_enum::elementwise_bias
-            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, max_seqlen_k)
-            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
-    ck_tile::HostTensor<AccDataType> alibi_slope_host(
-        bias.type == bias_enum::alibi
-            ? (bias.rank_info == 0 ? std::array<ck_tile::index_t, 2>{1, nhead}
-                                   : std::array<ck_tile::index_t, 2>{batch, nhead})
-            : std::array<ck_tile::index_t, 2>{1, 1});
-    ck_tile::HostTensor<ODataType> o_host(
-        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
-    ck_tile::HostTensor<LSEDataType> lse_host(
-        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
-    ck_tile::HostTensor<DDataType> d_host(
-        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
-    ck_tile::HostTensor<RandValOutputDataType> randval_host(
-        p_drop > 0 ? get_lengths(true, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
-                   : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
-    ck_tile::HostTensor<QGradDataType> dq_host(
-        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
-    ck_tile::HostTensor<KGradDataType> dk_host(
-        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_k, hdim_q));
-    ck_tile::HostTensor<VGradDataType> dv_host(
-        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_k, hdim_v));
-    ck_tile::HostTensor<OGradDataType> do_host(
-        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
-    ck_tile::HostTensor<BiasGradDataType> dbias_host(
-        use_dbias
-            ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
-            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
-    ck_tile::HostTensor<AccDataType> dq_acc_host(
-        i_perm
-            ? std::array<ck_tile::index_t, 5>{nsplits, shape_batch, nhead, shape_seqlen_q, hdim_q}
-            : std::array<ck_tile::index_t, 5>{nsplits, shape_batch, shape_seqlen_q, nhead, hdim_q});
-
-    if(init_method == 0)
-    {
-        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-2.f, 2.f, seed}(q_host);
-        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-2.f, 2.f, seed}(k_host);
-        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-2.f, 2.f, seed}(v_host);
-        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-2.f, 2.f, seed}(bias_host);
-        ck_tile::FillUniformDistributionIntegerValue<OGradDataType>{-2.f, 2.f, seed}(do_host);
-    }
-    else if(init_method == 1)
-    {
-        ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, seed}(q_host);
-        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(k_host);
-        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(v_host);
-        ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, seed}(bias_host);
-        ck_tile::FillUniformDistribution<OGradDataType>{0.f, 1.f, seed}(do_host);
-    }
-    else if(init_method == 2)
-    {
-        ck_tile::FillTrigValue<QDataType>{}(q_host);
-        ck_tile::FillTrigValue<KDataType>{}(k_host);
-        ck_tile::FillTrigValue<VDataType>{}(v_host);
-        ck_tile::FillTrigValue<BiasDataType>{}(bias_host);
-        ck_tile::FillTrigValue<OGradDataType>{}(do_host);
-    }
-    if(bias.type == bias_enum::alibi)
-    {
-        auto slopes = ck_tile::get_alibi_slopes<AccDataType>(nhead);
-        assert(slopes.size() == static_cast<decltype(slopes.size())>(nhead));
-        if(bias.rank_info == 0)
-        {
-            // alibi in 1*h
-            std::copy(slopes.begin(), slopes.end(), alibi_slope_host.begin());
-        }
-        else
-        {
-            // alibi in b*h
-            for(auto i_b = 0; i_b < batch; i_b++)
-            {
-                std::copy(slopes.begin(), slopes.end(), alibi_slope_host.begin() + i_b * nhead);
-            }
-        }
-    }
-
-    ck_tile::DeviceMem q_buf(q_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem k_buf(k_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem v_buf(v_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem lse_buf(lse_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem d_buf(d_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem dq_buf(dq_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem dk_buf(dk_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem dv_buf(dv_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem do_buf(do_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
-    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
-    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
-    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
-    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());
-
-    q_buf.ToDevice(q_host.data());
-    k_buf.ToDevice(k_host.data());
-    v_buf.ToDevice(v_host.data());
-    bias_buf.ToDevice(bias_host.data());
-    do_buf.ToDevice(do_host.data());
-    seqstart_q.ToDevice(seqstart_q_host.data());
-    seqstart_k.ToDevice(seqstart_k_host.data());
-    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
-    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
-    alibi_slope_buf.ToDevice(alibi_slope_host.data());
-
-    // clang-format off
-    auto layout_str = [&](bool permute){
-        if (permute) return std::string("bhsd");
-        else return std::string("bshd");
-    };
-    auto io_layout = [&](bool iperm_, bool operm_) {
-        if (iperm_ == operm_) return layout_str(iperm_);
-        else return layout_str(iperm_) + std::string("-") + layout_str(operm_);
-    };
-    // clang-format on
-    const std::string prec = arg_parser.get_str("prec");
-
-    std::cout << "[" << prec << "|" << mode << "|" << io_layout(i_perm, o_perm) << "] b:" << batch
-              << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_q << "/" << seqlen_k
-              << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias
-              << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", s_randval:" << s_randval
-              << ", deterministic:" << deterministic << ", mask:" << mask << std::flush;
-
-    std::size_t workspace_size =
-        dq_acc_host.get_element_space_size_in_bytes() * sizeof(AccDataType) / (1024 * 1024);
-
-    if(deterministic == 1)
-    {
-        std::cout << "\nDeterministic mode ON: " << workspace_size
-                  << " MByte memory workspace allocated" << std::endl;
-    }
-
-    auto fmha_traits = fmha_bwd_traits{hdim_q,
-                                       hdim_v,
-                                       data_type,
-                                       mode == mode_enum::group,
-                                       mask.type,
-                                       bias.type,
-                                       use_dbias,
-                                       p_drop > 0.0f,
-                                       s_randval,
-                                       deterministic};
-    auto fmha_args   = [&]() {
-        assert(nhead % nhead_k == 0);
-        /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
-        ///       seqlen_k] in this example, hence both the 'batch_stride_bias' &
-        ///       'nhead_stride_bias' are 0.
-        // setup stride_* arguments
-        const ck_tile::index_t stride_q       = (i_perm ? hdim_q : nhead * hdim_q);
-        const ck_tile::index_t stride_k       = (i_perm ? hdim_q : nhead_k * hdim_q);
-        const ck_tile::index_t stride_v       = (i_perm ? hdim_v : nhead_k * hdim_v);
-        const ck_tile::index_t stride_bias    = (max_seqlen_k);
-        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
-        const ck_tile::index_t stride_randval = (max_seqlen_k);
-        const ck_tile::index_t stride_do      = (o_perm ? hdim_v : nhead * hdim_v);
-        const ck_tile::index_t stride_dk      = (i_perm ? hdim_q : nhead * hdim_q);
-        const ck_tile::index_t stride_dv      = (i_perm ? hdim_v : nhead * hdim_v);
-        const ck_tile::index_t stride_dbias   = (i_perm ? max_seqlen_k : nhead * max_seqlen_k);
-        // setup nhead_stride_* arguments
-        const ck_tile::index_t nhead_stride_q       = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
-        const ck_tile::index_t nhead_stride_k       = (i_perm ? shape_seqlen_k * hdim_q : hdim_q);
-        const ck_tile::index_t nhead_stride_v       = (i_perm ? shape_seqlen_k * hdim_v : hdim_v);
-        const ck_tile::index_t nhead_stride_bias    = 0;
-        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
-        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t nhead_stride_do      = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
-        const ck_tile::index_t nhead_stride_lsed    = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_dbias =
-            (i_perm ? shape_seqlen_q * max_seqlen_k : max_seqlen_k);
-        // setup batch_stride_* arguments
-        const ck_tile::index_t batch_stride_q       = (nhead * shape_seqlen_q * hdim_q);
-        const ck_tile::index_t batch_stride_k       = (nhead_k * shape_seqlen_k * hdim_q);
-        const ck_tile::index_t batch_stride_v       = (nhead_k * shape_seqlen_k * hdim_v);
-        const ck_tile::index_t batch_stride_bias    = 0;
-        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
-        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t batch_stride_do      = (nhead * shape_seqlen_q * hdim_v);
-        const ck_tile::index_t batch_stride_lsed    = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_dk      = (nhead * shape_seqlen_k * hdim_q);
-        const ck_tile::index_t batch_stride_dv      = (nhead * shape_seqlen_k * hdim_v);
-        const ck_tile::index_t batch_stride_dbias   = (nhead * shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t split_stride_dq_acc =
-            (shape_batch * nhead * shape_seqlen_q * hdim_q);
-
-        const auto drop_seed_offset = [&]() -> decltype(fmha_bwd_args::drop_seed_offset) {
-            if(drop_prefs)
-            {
-                return std::make_pair(drop_seed_buf.GetDeviceBuffer(),
-                                      drop_offset_buf.GetDeviceBuffer());
-            }
-            else
-            {
-                return std::make_pair(drop_seed, drop_offset);
-            }
-        }();
-
-        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
-                             k_buf.GetDeviceBuffer(),
-                             v_buf.GetDeviceBuffer(),
-                             bias.type == bias_enum::alibi ? alibi_slope_buf.GetDeviceBuffer()
-                                                             : bias_buf.GetDeviceBuffer(),
-                             o_buf.GetDeviceBuffer(),
-                             lse_buf.GetDeviceBuffer(),
-                             do_buf.GetDeviceBuffer(),
-                             d_buf.GetDeviceBuffer(),
-                             randval_buf.GetDeviceBuffer(),
-                             dq_buf.GetDeviceBuffer(),
-                             dk_buf.GetDeviceBuffer(),
-                             dv_buf.GetDeviceBuffer(),
-                             dbias_buf.GetDeviceBuffer(),
-                             dq_acc_buf.GetDeviceBuffer(),
-                             seqstart_q.GetDeviceBuffer(),
-                             seqstart_k.GetDeviceBuffer(),
-                             nullptr,
-                             shape_seqlen_q,
-                             shape_seqlen_k,
-                             batch,
-                             max_seqlen_q,
-                             max_seqlen_k,
-                             hdim_q,
-                             hdim_v,
-                             nhead,
-                             nhead_k,
-                             scale,
-                             stride_q,
-                             stride_k,
-                             stride_v,
-                             bias.type == bias_enum::alibi ? (bias.rank_info == 0 ? 0 : nhead)
-                                                             : stride_bias,
-                             stride_o,
-                             stride_randval,
-                             stride_do,
-                             stride_q, // stride_dq_acc
-                             stride_q, // stride_dq
-                             stride_dk,
-                             stride_dv,
-                             stride_dbias,
-                             nhead_stride_q,
-                             nhead_stride_k,
-                             nhead_stride_v,
-                             nhead_stride_bias,
-                             nhead_stride_o,
-                             nhead_stride_randval,
-                             nhead_stride_do,
-                             nhead_stride_lsed,
-                             nhead_stride_q, // nhead_stride_dq_acc
-                             nhead_stride_q, // nhead_stride_dq
-                             nhead_stride_k, // nhead_stride_dk
-                             nhead_stride_v, // nhead_stride_dv
-                             nhead_stride_dbias,
-                             batch_stride_q,
-                             batch_stride_k,
-                             batch_stride_v,
-                             batch_stride_bias,
-                             batch_stride_o,
-                             batch_stride_randval,
-                             batch_stride_do,
-                             batch_stride_lsed,
-                             batch_stride_q, // batch_stride_dq_acc
-                             batch_stride_q, // batch_stride_dq
-                             batch_stride_dk,
-                             batch_stride_dv,
-                             batch_stride_dbias,
-                             split_stride_dq_acc,
-                             mask.left,
-                             mask.right,
-                             static_cast<ck_tile::index_t>(mask.type),
-                             p_drop,
-                             p_undrop,
-                             drop_seed_offset};
-    }();
-
-    float ave_time = fmha_bwd(fmha_traits, fmha_args, stream_config);
-    if(ave_time < 0)
-    {
-        std::cout << ", not supported yet" << std::flush << std::endl;
-        return false;
-    }
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
-              << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
-              << " GB/s" << std::flush;
-
-    if(!do_validation)
-    {
-        std::cout << std::flush << std::endl;
-        return true;
-    }
-
-    bool pass = true;
-
-    std::vector<ck_tile::HostTensor<QDataType>> q_host_refs;
-    std::vector<ck_tile::HostTensor<KDataType>> k_host_refs;
-    std::vector<ck_tile::HostTensor<VDataType>> v_host_refs;
-    std::vector<ck_tile::HostTensor<ODataType>> o_host_refs;
-    std::vector<ck_tile::HostTensor<RandValOutputDataType>> randval_host_refs;
-    std::vector<ck_tile::HostTensor<AccDataType>> p_hp_host_refs;
-    std::vector<ck_tile::HostTensor<GemmDataType>> p_lp_host_refs;
-
-    randval_buf.FromDevice(randval_host.data());
-
-    for(ck_tile::index_t wb = 0; wb < batch; ++wb)
-    {
-        const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
-        const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
-
-        // adjust matrix index according to the mode
-        const ck_tile::index_t b            = (mode == mode_enum::batch ? wb : 0);
-        const ck_tile::index_t query_offset = (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
-        const ck_tile::index_t key_offset   = (mode == mode_enum::batch ? 0 : seqstart_k_host[wb]);
-
-        ck_tile::HostTensor<QDataType> q_host_ref({nhead, real_seqlen_q, hdim_q}); // q_g_m_k
-        ck_tile::HostTensor<KDataType> k_host_ref({nhead, real_seqlen_k, hdim_q}); // k_g_n_k
-        ck_tile::HostTensor<VDataType> v_host_ref({nhead, hdim_v, real_seqlen_k}); // v_g_o_n
-        ck_tile::HostTensor<ODataType> o_host_ref({nhead, real_seqlen_q, hdim_v}); // o_g_m_o
-        ck_tile::HostTensor<LSEDataType> lse_host_ref({nhead, real_seqlen_q});     // lse_g_m
-        ck_tile::HostTensor<RandValOutputDataType> randval_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // randval_g_m_n
-        ck_tile::HostTensor<AccDataType> s_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // s_g_m_n
-        ck_tile::HostTensor<AccDataType> p_hp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // p_hp_g_m_n high precision
-        ck_tile::HostTensor<AccDataType> p_dropped_hp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // p_dropped_hp_g_m_n high precision
-        ck_tile::HostTensor<GemmDataType> p_lp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // p_lp_g_m_n low precision
-
-        ck_tile::index_t nr = nhead / nhead_k;
-
-        // clang-format off
-        // permute
-        if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[0], i[1] + query_offset, i[2]); });
-        else       q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[1] + query_offset, i[0], i[2]); });
-
-        if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[0] / nr, i[1] + key_offset, i[2]); });
-        else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[1] + key_offset, i[0] / nr, i[2]); });
-
-        // v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
-        if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[0] / nr, i[2] + key_offset, i[1]); });
-        // v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
-        else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[2] + key_offset, i[0] / nr, i[1]); });
-        // clang-format on
-
-        // reference
-        // S = scale * Q * K^T
-        ck_tile::reference_batched_gemm<QDataType, KDataType, AccDataType, AccDataType>(
-            q_host_ref,
-            k_host_ref,
-            s_host_ref,
-            ck_tile::identity{},
-            ck_tile::identity{},
-            ck_tile::scales(scale)); // s_g_m_n = scale * q_g_m_k@k_g_n_k
-
-        if(bias.type == bias_enum::elementwise_bias)
-        {
-            // elementwise bias
-            ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
-            // clang-format off
-            if(i_perm)
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
-            else
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
-            // clang-format on
-
-            // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
-            // real_seqlen_k]
-            ck_tile::
-                reference_batched_elementwise<AccDataType, BiasDataType, AccDataType, AccDataType>(
-                    s_host_ref, bias_host_ref, s_host_ref);
-        }
-        else if(bias.type == bias_enum::alibi)
-        {
-            // alibi construct elementwise bias to verify
-            auto alibi_host = [&]() {
-                if(mask.type != mask_enum::no_mask)
-                {
-                    return ck_tile::make_alibi_from_lr_mask<AccDataType, false>(
-                        0,
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        static_cast<ck_tile::GenericAttentionMaskEnum>(mask.type));
-                }
-                else
-                {
-                    return ck_tile::Alibi<AccDataType, false>{
-                        0, real_seqlen_q, real_seqlen_k, ck_tile::AlibiMode::FROM_BOTTOM_RIGHT};
-                }
-            }();
-
-            ck_tile::HostTensor<AccDataType> alibi_bias_host_ref(
-                {nhead, real_seqlen_q, real_seqlen_k});
-            auto i_b_slope = bias.rank_info == 0 ? 0 : wb;
-            for(auto i_h = 0; i_h < nhead; i_h++)
-            {
-                AccDataType current_slope = alibi_slope_host(i_b_slope, i_h);
-                alibi_host.slope = alibi_host.mode == ck_tile::AlibiMode::VERTICAL ? current_slope
-                                                                                   : -current_slope;
-                for(auto i_r = 0; i_r < real_seqlen_q; i_r++)
-                {
-                    for(auto i_c = 0; i_c < real_seqlen_k; i_c++)
-                    {
-                        AccDataType pixel = 0;
-                        alibi_host.update(pixel, i_r, i_c);
-                        alibi_bias_host_ref(i_h, i_r, i_c) = pixel;
-                    }
-                }
-            }
-            // [nhead, real_seqlen_q, real_seqlen_k]
-            ck_tile::
-                reference_batched_elementwise<AccDataType, AccDataType, AccDataType, AccDataType>(
-                    s_host_ref, alibi_bias_host_ref, s_host_ref);
-        }
-
-        if(mask.type == mask_enum::no_mask)
-        {
-            ck_tile::reference_batched_masking<AccDataType>(
-                s_host_ref, FmhaMasks::NoMask{real_seqlen_q, real_seqlen_k});
-        }
-        else if(mask.type == mask_enum::window_generic)
-        {
-            ck_tile::reference_batched_masking<AccDataType>(
-                s_host_ref,
-                ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
-                    mask.left, mask.right, real_seqlen_q, real_seqlen_k));
-        }
-        else
-        {
-            // if left window size is negative, means causal
-            // else means generic (for current batch)
-            if(mask.left < 0)
-                ck_tile::reference_batched_masking<AccDataType>(
-                    s_host_ref,
-                    ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::CausalMask>(
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        mask.type == mask_enum::mask_top_left));
-            else
-                ck_tile::reference_batched_masking<AccDataType>(
-                    s_host_ref,
-                    ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        mask.type == mask_enum::mask_top_left));
-        }
-        ck_tile::reference_batched_softmax<AccDataType, LSEDataType, AccDataType>(
-            s_host_ref, p_hp_host_ref, ck_tile::identity{}, lse_host_ref);
-
-        if(p_drop > 0)
-        {
-            p_dropped_hp_host_ref = p_hp_host_ref;
-            randval_host_ref.ForEach([&](auto& self, auto idx) {
-                self(idx) = randval_host(b, idx[0], idx[1] + query_offset, idx[2]);
-            });
-            ck_tile::reference_batched_dropout(
-                p_dropped_hp_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
-            p_lp_host_ref = p_dropped_hp_host_ref.template CopyAsType<GemmDataType>();
-        }
-        else
-        {
-            p_lp_host_ref = p_hp_host_ref.template CopyAsType<GemmDataType>();
-        }
-
-        // O = P * V
-        ck_tile::reference_batched_gemm<GemmDataType, VDataType, AccDataType, ODataType>(
-            p_lp_host_ref, v_host_ref, o_host_ref); // o_g_m_o = p_lp_g_m_n@v_g_o_n
-
-        // clang-format off
-        // permute
-        if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[0], idx[1] + query_offset, idx[2]) = self(idx); });
-        else       o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[1] + query_offset, idx[0], idx[2]) = self(idx); });
-
-        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(b, idx[0], idx[1] + query_offset) = self(idx); });
-        // clang-format on
-
-        q_host_refs.push_back(q_host_ref);
-        k_host_refs.push_back(k_host_ref);
-        v_host_refs.push_back(v_host_ref);
-        o_host_refs.push_back(o_host_ref);
-        p_hp_host_refs.push_back(p_hp_host_ref);
-        p_lp_host_refs.push_back(p_lp_host_ref);
-        if(p_drop > 0)
-        {
-            randval_host_refs.push_back(randval_host_ref);
-        }
-    }
-
-    // set to bad values to check if the kernel writes to these buffers
-    ck_tile::FillConstant<QGradDataType>{ck_tile::numeric<QGradDataType>::infinity()}(dq_host);
-    ck_tile::FillConstant<KGradDataType>{ck_tile::numeric<KGradDataType>::infinity()}(dk_host);
-    ck_tile::FillConstant<VGradDataType>{ck_tile::numeric<VGradDataType>::infinity()}(dv_host);
-    dq_buf.ToDevice(dq_host.data());
-    dk_buf.ToDevice(dk_host.data());
-    dv_buf.ToDevice(dv_host.data());
-
-    o_buf.ToDevice(o_host.data());
-    lse_buf.ToDevice(lse_host.data());
-    dq_buf.SetZero();
-    dbias_buf.SetZero();
-    dq_acc_buf.SetZero();
-
-    ck_tile::stream_config stream_config_v{
-        nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")};
-    fmha_bwd(fmha_traits, fmha_args, stream_config_v);
-
-    dq_buf.FromDevice(dq_host.data());
-    dk_buf.FromDevice(dk_host.data());
-    dv_buf.FromDevice(dv_host.data());
-    dbias_buf.FromDevice(dbias_host.data());
-
-    for(ck_tile::index_t wb = 0; wb < batch; ++wb)
-    {
-        const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
-        const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
-
-        // adjust matrix index according to the mode
-        const ck_tile::index_t b            = (mode == mode_enum::batch ? wb : 0);
-        const ck_tile::index_t query_offset = (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
-        const ck_tile::index_t key_offset   = (mode == mode_enum::batch ? 0 : seqstart_k_host[wb]);
-
-        ck_tile::HostTensor<OGradDataType> do_host_ref({nhead, real_seqlen_q, hdim_v}); // do_g_m_o
-        ck_tile::HostTensor<AccDataType> ds_hp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // ds_g_m_n high precision
-        ck_tile::HostTensor<GemmDataType> ds_lp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // ds_g_m_n low precision
-        ck_tile::HostTensor<AccDataType> dp_hp_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // dp_g_m_n high precision
-        ck_tile::HostTensor<BiasGradDataType> dbias_host_ref(
-            {nhead, real_seqlen_q, real_seqlen_k}); // dbias_g_m_n
-        ck_tile::HostTensor<QGradDataType> dq_host_ref({nhead, real_seqlen_q, hdim_q}); // dq_g_m_k
-        ck_tile::HostTensor<KGradDataType> dk_host_ref({nhead, real_seqlen_k, hdim_q}); // dk_g_n_k
-        ck_tile::HostTensor<VGradDataType> dv_host_ref({nhead, real_seqlen_k, hdim_v}); // dv_g_n_o
-
-        // clang-format off
-        if(o_perm) do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[0], i[1] + query_offset, i[2]); });
-        else       do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[1] + query_offset, i[0], i[2]); });
-        // clang-format on
-
-        // dP = dO@V x Z w/  dropout
-        // dP = dO@V     w/o dropout
-        auto v_t_host_ref = v_host_refs[wb].transpose({0, 2, 1}); // v_g_o_n -> v_g_n_o
-        ck_tile::reference_batched_gemm<OGradDataType, VDataType, AccDataType, AccDataType>(
-            do_host_ref, v_t_host_ref, dp_hp_host_ref); // dp_g_m_n = do_g_m_o@v_g_n_o
-
-        if(p_drop > 0)
-        {
-            ck_tile::reference_batched_dropout(
-                dp_hp_host_ref, randval_host_refs[wb], p_undrop_in_uint8_t, rp_undrop);
-        }
-
-        // dS_i_j = P_i_j .* (dP_i_j - dO_i dot O_i)
-        ck_tile::make_ParallelTensorFunctor(
-            [&](auto i0, auto i1, auto i2) {
-                AccDataType do_dot_o = 0;
-                for(int o = 0; o < hdim_v; o++)
-                {
-                    do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(i0, i1, o)) *
-                                ck_tile::type_convert<AccDataType>(o_host_refs[wb](i0, i1, o));
-                }
-                ds_hp_host_ref(i0, i1, i2) = ck_tile::type_convert<AccDataType>(
-                    p_hp_host_refs[wb](i0, i1, i2) * (dp_hp_host_ref(i0, i1, i2) - do_dot_o));
-            },
-            ds_hp_host_ref.mDesc.get_lengths()[0],
-            ds_hp_host_ref.mDesc.get_lengths()[1],
-            ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency());
-
-        if(use_dbias)
-        {
-            dbias_host_ref = ds_hp_host_ref.template CopyAsType<BiasGradDataType>();
-        }
-
-        ds_lp_host_ref = ds_hp_host_ref.template CopyAsType<GemmDataType>();
-
-        // dV = P_drop^T@dO^T
-        // dV = P^T@dO^T w/o dropout
-        auto p_t_lp_host_ref = p_lp_host_refs[wb].transpose({0, 2, 1}); // p_lp_g_m_n -> p_lp_g_n_m
-        auto do_t_host_ref   = do_host_ref.transpose({0, 2, 1});        // do_g_m_o -> do_g_o_m
-        ck_tile::reference_batched_gemm<GemmDataType, OGradDataType, AccDataType, VGradDataType>(
-            p_t_lp_host_ref, do_t_host_ref, dv_host_ref); // dv_g_n_o = p_lp_g_n_m@do_g_o_m
-
-        // dQ = scale * dS@K^T
-        auto k_t_host_ref = k_host_refs[wb].transpose({0, 2, 1}); // k_g_n_k -> k_g_k_n
-        ck_tile::reference_batched_gemm<GemmDataType, KDataType, AccDataType, QGradDataType>(
-            ds_lp_host_ref,
-            k_t_host_ref,
-            dq_host_ref,
-            ck_tile::identity{},
-            ck_tile::identity{},
-            ck_tile::scales(scale)); // dq_g_m_k = ds_g_m_n@k_g_k_n
-
-        // dK = scale * dS^T@Q^T
-        auto ds_t_lp_host_ref = ds_lp_host_ref.transpose({0, 2, 1});  // ds_g_m_n -> ds_g_n_m
-        auto q_t_host_ref     = q_host_refs[wb].transpose({0, 2, 1}); // q_g_m_k -> q_g_k_m
-        ck_tile::reference_batched_gemm<GemmDataType, QDataType, AccDataType, KGradDataType>(
-            ds_t_lp_host_ref,
-            q_t_host_ref,
-            dk_host_ref,
-            ck_tile::identity{},
-            ck_tile::identity{},
-            ck_tile::scales(scale)); // dk_g_n_k = ds_g_n_m@q_g_k_m
-
-        ck_tile::HostTensor<QGradDataType> dq_host_result(
-            {nhead, real_seqlen_q, hdim_q}); // dq_g_m_k
-        ck_tile::HostTensor<KGradDataType> dk_host_result(
-            {nhead, real_seqlen_k, hdim_q}); // dk_g_n_k
-        ck_tile::HostTensor<VGradDataType> dv_host_result(
-            {nhead, real_seqlen_k, hdim_v}); // dv_g_n_o
-        ck_tile::HostTensor<BiasGradDataType> dbias_host_result(
-            {nhead, real_seqlen_q, real_seqlen_k}); // dbias_g_m_n
-
-        // clang-format off
-        // permute
-        if(i_perm) dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[0], idx[1] + query_offset, idx[2]); });
-        else       dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[1] + query_offset, idx[0], idx[2]); });
-
-        if(i_perm) dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[0], idx[1] + key_offset, idx[2]); });
-        else       dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[1] + key_offset, idx[0], idx[2]); });
-
-        if(i_perm) dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[0], idx[1] + key_offset, idx[2]); });
-        else       dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[1] + key_offset, idx[0], idx[2]); });
-
-        if(use_dbias)
-        {
-            if(i_perm) dbias_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dbias_host(b, idx[0], idx[1] + query_offset, idx[2]); });
-            else       dbias_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dbias_host(b, idx[1] + query_offset, idx[0], idx[2]); });
-        }
-        // clang-format on
-
-        auto [rtol, atol] = get_elimit<DataTypeConfig>(hdim_q, hdim_v);
-        bool dq_cur_pass  = ck_tile::check_err(dq_host_result,
-                                              dq_host_ref,
-                                              std::string("Error: QGrad Incorrect results!"),
-                                              rtol,
-                                              atol);
-        bool dk_cur_pass  = ck_tile::check_err(dk_host_result,
-                                              dk_host_ref,
-                                              std::string("Error: KGrad Incorrect results!"),
-                                              rtol,
-                                              atol);
-        bool dv_cur_pass  = ck_tile::check_err(dv_host_result,
-                                              dv_host_ref,
-                                              std::string("Error: VGrad Incorrect results!"),
-                                              rtol,
-                                              atol);
-
-        bool dbias_cur_pass = true;
-        if(use_dbias)
-        {
-            dbias_cur_pass = ck_tile::check_err(dbias_host_result,
-                                                dbias_host_ref,
-                                                std::string("Error: BiasGrad Incorrect results!"),
-                                                rtol,
-                                                atol);
-        }
-        pass &= (dq_cur_pass & dk_cur_pass & dv_cur_pass & dbias_cur_pass);
-        if(!(dq_cur_pass & dk_cur_pass & dv_cur_pass & dbias_cur_pass))
-        {
-            std::cerr << "mismatch found at batch: " << wb << std::endl
-                      << "\tseqlen_q: " << real_seqlen_q << std::endl
-                      << "\tseqlen_k: " << real_seqlen_k << std::endl
-                      << "\tseqstart_q: " << seqstart_q_host << std::endl
-                      << "\tseqstart_k: " << seqstart_k_host << std::endl;
-
-            break;
-        }
-    }
-
-    std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
-
-    return pass;
-}
-
-int main(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
-    {
-        return run<FmhaBwdFp16>(arg_parser) ? 0 : -2;
-    }
-    else if(data_type == "bf16")
-    {
-        return run<FmhaBwdBf16>(arg_parser) ? 0 : -2;
-    }
-
-    return -3;
-}
diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp
index 8d35b2d12c..6cd1cd94fa 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -15,6 +15,10 @@
 #include <utility>
 #include <variant>
 
+struct FmhaBwdFp32
+{
+};
+
 struct FmhaBwdFp16
 {
 };
@@ -26,6 +30,26 @@ struct FmhaBwdBf16
 template <typename DataType>
 struct FmhaBwdTypeConfig;
 
+template <>
+struct FmhaBwdTypeConfig<FmhaBwdFp32>
+{
+    using QDataType             = float;
+    using KDataType             = float;
+    using VDataType             = float;
+    using GemmDataType          = float;
+    using BiasDataType          = float;
+    using LSEDataType           = float;
+    using AccDataType           = float; // data type for gemm accumulation
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+    using ODataType             = float;
+    using OGradDataType         = float;
+    using QGradDataType         = float;
+    using KGradDataType         = float;
+    using VGradDataType         = float;
+    using BiasGradDataType      = float;
+};
+
 template <>
 struct FmhaBwdTypeConfig<FmhaBwdFp16>
 {
@@ -368,11 +392,12 @@ template <ck_tile::index_t HDim_,
           typename FmhaDropout_,
           ck_tile::BlockAttentionBiasEnum BiasEnum_,
           bool kHasBiasGrad_,
-          bool kPadD_,
-          bool kPadDv_,
+          ck_tile::index_t kPadD_,
+          ck_tile::index_t kPadDv_,
           bool kIsDeterministic_,
           bool kUseTrLoad_,
-          ck_tile::index_t MaxSeqLenQ_>
+          ck_tile::index_t MaxSeqLenQ_,
+          ck_tile::index_t kN0>
 struct fmha_bwd_dq_dk_dv_traits_
 {
 };
@@ -412,15 +437,10 @@ template <ck_tile::index_t HDim_,
           bool kIsGroupMode_,
           bool kPadS_,
           bool kPadD_,
-          bool kIsDeterministic_>
+          bool kIsDeterministic_,
+          ck_tile::index_t kN0>
 struct fmha_bwd_convert_dq_traits_
 {
-    static constexpr ck_tile::index_t HDim = HDim_;
-    using DataType                         = ck_tile::remove_cvref_t<DataType_>;
-    static constexpr bool kIsGroupMode     = kIsGroupMode_;
-    static constexpr bool kPadS            = kPadS_;
-    static constexpr bool kPadD            = kPadD_;
-    static constexpr bool kIsDeterministic = kIsDeterministic_;
 };
 
 template <typename Traits_>
diff --git a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp
new file mode 100644
index 0000000000..b6f2c8ca30
--- /dev/null
+++ b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/host.hpp"
+#include "fmha_bwd.hpp"
+#include "utils.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+#include <array>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+enum class bwd_result
+{
+    success,
+    failure,
+    invalid_args,
+    no_instance,
+};
+
+// different threshold for different dtype
+template <typename DataTypeConfig>
+auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<FmhaBwdFp32>(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
+{
+    double rtol = 1e-4;
+    double atol = 1e-4;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<FmhaBwdBf16>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    if(hdim_q > 128 && hdim_v > 128) // 3.2 for RTZ/1.5 for RTN
+    {
+        rtol = 3.2e-2;
+        atol = 3.2e-2;
+    }
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+extern template float fmha_bwd<2>(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
+
+template <typename DataTypeConfig>
+bwd_result fmha_bwd_run(mode_enum mode,
+                        ck_tile::index_t batch,
+                        ck_tile::index_t nhead,
+                        ck_tile::index_t nhead_k,
+                        std::vector<ck_tile::index_t> seqlen_qs,
+                        std::vector<ck_tile::index_t> seqlen_ks,
+                        ck_tile::index_t hdim_q,
+                        ck_tile::index_t hdim_v,
+                        bool i_perm,
+                        bool o_perm,
+                        float scale,
+                        std::string bias_str,
+                        bool use_dbias,
+                        float p_drop,
+                        uint64_t drop_seed,
+                        uint64_t drop_offset,
+                        bool drop_prefs,
+                        std::string mask_str,
+                        bool deterministic,
+                        std::string init_method,
+                        uint32_t seed,
+                        int do_validation,
+                        const ck_tile::stream_config& stream_config,
+                        std::optional<std::string> json = std::nullopt)
+{
+    const std::string data_type = []() {
+        if constexpr(std::is_same_v<DataTypeConfig, FmhaBwdFp32>)
+            return "fp32";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaBwdFp16>)
+            return "fp16";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaBwdBf16>)
+            return "bf16";
+        else
+            static_assert(false);
+    }();
+
+    if(nhead_k < 0)
+        nhead_k = nhead;
+    if(nhead % nhead_k != 0)
+    {
+        std::cerr << "nhead:" << nhead << " must be multiple of nhead_k:" << nhead_k << std::endl;
+        return bwd_result::invalid_args;
+    }
+
+    std::mt19937 random_engine(seed != 0 ? seed : std::random_device{}());
+    auto next_seed = [&random_engine]() { return static_cast<unsigned int>(random_engine()); };
+
+    if(hdim_v < 0)
+        hdim_v = hdim_q;
+
+    if(scale == .0f)
+        scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q));
+
+    bias_info bias = bias_info::decode(bias_str);
+
+    if(use_dbias && bias.type != bias_enum::elementwise_bias)
+    {
+        std::cerr << "dbias only exists when bias type is elementwise" << std::endl;
+        return bwd_result::invalid_args;
+    }
+    std::vector<ck_tile::index_t> seqlen_kpads;
+    std::tie(seqlen_qs, seqlen_ks, seqlen_kpads) =
+        generate_missing_seqlens(mode, batch, seqlen_qs, seqlen_ks, {}, 0, false, random_engine);
+    ck_tile::ignore = seqlen_kpads;
+#if 0
+    std::cout << "seqlen_qs: " << seqlen_qs << std::endl;
+    std::cout << "seqlen_ks: " << seqlen_ks << std::endl;
+#endif
+
+    mask_info mask = mask_info::decode(mask_str, seqlen_qs[0], seqlen_ks[0]);
+
+    if(p_drop < 0.0f || p_drop > 1.0f)
+    {
+        std::cerr << "The value of p_drop should be 0~1" << std::endl;
+        return bwd_result::invalid_args;
+    }
+    float p_undrop = 1.0 - p_drop;
+    uint8_t p_undrop_in_uint8_t =
+        uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
+    float rp_undrop = 1.0 / p_undrop;
+
+    bool s_randval = false;
+    if(p_drop > 0.0f && do_validation)
+    {
+        s_randval = true;
+    }
+
+    const auto seqstart_q_host = to_seqstarts(seqlen_qs);
+    const auto seqstart_k_host = to_seqstarts(seqlen_ks);
+
+    using TypeConfig = FmhaBwdTypeConfig<DataTypeConfig>;
+
+    using QDataType             = typename TypeConfig::QDataType;
+    using KDataType             = typename TypeConfig::KDataType;
+    using VDataType             = typename TypeConfig::VDataType;
+    using GemmDataType          = typename TypeConfig::GemmDataType;
+    using BiasDataType          = typename TypeConfig::BiasDataType;
+    using LSEDataType           = typename TypeConfig::LSEDataType;
+    using AccDataType           = typename TypeConfig::AccDataType;
+    using DDataType             = typename TypeConfig::DDataType;
+    using RandValOutputDataType = typename TypeConfig::RandValOutputDataType;
+    using ODataType             = typename TypeConfig::ODataType;
+    using OGradDataType         = typename TypeConfig::OGradDataType;
+    using QGradDataType         = typename TypeConfig::QGradDataType;
+    using KGradDataType         = typename TypeConfig::KGradDataType;
+    using VGradDataType         = typename TypeConfig::VGradDataType;
+    using BiasGradDataType      = typename TypeConfig::BiasGradDataType;
+
+    // accumulation numbers for performance evaluation
+    std::size_t flop = 0, num_byte = 0;
+    auto max_seqlen_q =
+        std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
+    auto max_seqlen_k =
+        std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
+    {
+        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+        {
+            const int32_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
+            const int32_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
+
+            if(max_seqlen_q < real_seqlen_q)
+            {
+                max_seqlen_q = real_seqlen_q;
+            }
+
+            if(max_seqlen_k < real_seqlen_k)
+            {
+                max_seqlen_k = real_seqlen_k;
+            }
+
+            flop += nhead * (static_cast<std::size_t>(3) * static_cast<std::size_t>(2) *
+                                 real_seqlen_q * real_seqlen_k * hdim_q + // Q@K/dS^T@Q^T/dS@K^T
+                             static_cast<std::size_t>(2) * static_cast<std::size_t>(2) *
+                                 real_seqlen_q * real_seqlen_k * hdim_v); // dO@V/P^T@dO^T
+
+            num_byte += nhead * (sizeof(QDataType) * real_seqlen_q * hdim_q +
+                                 sizeof(KDataType) * real_seqlen_k * hdim_q +
+                                 sizeof(VDataType) * real_seqlen_k * hdim_v +
+                                 sizeof(ODataType) * real_seqlen_q * hdim_v +
+                                 sizeof(OGradDataType) * real_seqlen_q * hdim_v +
+                                 sizeof(QGradDataType) * real_seqlen_q * hdim_q +
+                                 sizeof(KGradDataType) * real_seqlen_k * hdim_q +
+                                 sizeof(VGradDataType) * real_seqlen_k * hdim_v +
+                                 sizeof(LSEDataType) * real_seqlen_q);
+        }
+    }
+
+    auto get_lengths = [&](bool permute,
+                           ck_tile::index_t b /*batch*/,
+                           ck_tile::index_t h /*nhead*/,
+                           ck_tile::index_t s /*seqlen*/,
+                           ck_tile::index_t d /*hdim*/) {
+        if(permute)
+            return std::array<ck_tile::index_t, 4>{b, h, s, d};
+        else
+            return std::array<ck_tile::index_t, 4>{b, s, h, d};
+    };
+
+    // host memory for storing all the tensor elements
+    const ck_tile::index_t shape_batch = (mode == mode_enum::batch ? batch : 1);
+    const ck_tile::index_t shape_seqlen_q =
+        (mode == mode_enum::batch ? seqlen_qs[0] : seqstart_q_host.back());
+    const ck_tile::index_t shape_seqlen_k =
+        (mode == mode_enum::batch ? seqlen_ks[0] : seqstart_k_host.back());
+    // Keep it equal to or smaller than minimal bn0 of all tiles in fmha_bwd.py
+    // TODO: add API for requesting kN0/nsplits/workspace_size? It is not safe to rely on internal
+    // implementation details in client code.
+    const ck_tile::index_t kN0 = 16;
+    const ck_tile::index_t nsplits =
+        deterministic ? ck_tile::integer_divide_ceil(max_seqlen_k, kN0) : 1;
+
+    ck_tile::HostTensor<QDataType> q_host(
+        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
+    ck_tile::HostTensor<KDataType> k_host(
+        get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_q));
+    ck_tile::HostTensor<VDataType> v_host(
+        get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_v));
+    ck_tile::HostTensor<BiasDataType> bias_host(
+        bias.type == bias_enum::elementwise_bias
+            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, max_seqlen_k)
+            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
+    ck_tile::HostTensor<AccDataType> alibi_slope_host(
+        bias.type == bias_enum::alibi
+            ? (bias.rank_info == 0 ? std::array<ck_tile::index_t, 2>{1, nhead}
+                                   : std::array<ck_tile::index_t, 2>{batch, nhead})
+            : std::array<ck_tile::index_t, 2>{1, 1});
+    ck_tile::HostTensor<ODataType> o_host(
+        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
+    ck_tile::HostTensor<LSEDataType> lse_host(
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
+    ck_tile::HostTensor<DDataType> d_host(
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
+    ck_tile::HostTensor<RandValOutputDataType> randval_host(
+        p_drop > 0 ? get_lengths(true, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
+                   : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
+    ck_tile::HostTensor<QGradDataType> dq_host(
+        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
+    ck_tile::HostTensor<KGradDataType> dk_host(
+        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_k, hdim_q));
+    ck_tile::HostTensor<VGradDataType> dv_host(
+        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_k, hdim_v));
+    ck_tile::HostTensor<OGradDataType> do_host(
+        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
+    ck_tile::HostTensor<BiasGradDataType> dbias_host(
+        use_dbias
+            ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
+            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
+    ck_tile::HostTensor<AccDataType> dq_acc_host(
+        i_perm
+            ? std::array<ck_tile::index_t, 5>{nsplits, shape_batch, nhead, shape_seqlen_q, hdim_q}
+            : std::array<ck_tile::index_t, 5>{nsplits, shape_batch, shape_seqlen_q, nhead, hdim_q});
+
+    if(init_method == "ui" || init_method == "0")
+    {
+        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-2.f, 2.f, next_seed()}(q_host);
+        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-2.f, 2.f, next_seed()}(k_host);
+        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-2.f, 2.f, next_seed()}(v_host);
+        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-2.f, 2.f, next_seed()}(
+            bias_host);
+        ck_tile::FillUniformDistributionIntegerValue<OGradDataType>{-2.f, 2.f, next_seed()}(
+            do_host);
+    }
+    else if(init_method == "uf" || init_method == "1")
+    {
+        ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, next_seed()}(q_host);
+        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, next_seed()}(k_host);
+        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, next_seed()}(v_host);
+        ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, next_seed()}(bias_host);
+        ck_tile::FillUniformDistribution<OGradDataType>{0.f, 1.f, next_seed()}(do_host);
+    }
+    else if(init_method == "tf" || init_method == "2")
+    {
+        ck_tile::FillTrigValue<QDataType>{}(q_host);
+        ck_tile::FillTrigValue<KDataType>{}(k_host);
+        ck_tile::FillTrigValue<VDataType>{}(v_host);
+        ck_tile::FillTrigValue<BiasDataType>{}(bias_host);
+        ck_tile::FillTrigValue<OGradDataType>{}(do_host);
+    }
+    else
+    {
+        std::cerr << "Unknown value for init argument: " << init_method << std::endl;
+        return bwd_result::invalid_args;
+    }
+
+    if(bias.type == bias_enum::alibi)
+    {
+        auto slopes = ck_tile::get_alibi_slopes<AccDataType>(nhead);
+        assert(slopes.size() == static_cast<decltype(slopes.size())>(nhead));
+        if(bias.rank_info == 0)
+        {
+            // alibi in 1*h
+            std::copy(slopes.begin(), slopes.end(), alibi_slope_host.begin());
+        }
+        else
+        {
+            // alibi in b*h
+            for(auto i_b = 0; i_b < batch; i_b++)
+            {
+                std::copy(slopes.begin(), slopes.end(), alibi_slope_host.begin() + i_b * nhead);
+            }
+        }
+    }
+
+    ck_tile::DeviceMem q_buf(q_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem k_buf(k_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem v_buf(v_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem lse_buf(lse_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem d_buf(d_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dq_buf(dq_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dk_buf(dk_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dv_buf(dv_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem do_buf(do_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());
+
+    q_buf.ToDevice(q_host.data());
+    k_buf.ToDevice(k_host.data());
+    v_buf.ToDevice(v_host.data());
+    bias_buf.ToDevice(bias_host.data());
+    do_buf.ToDevice(do_host.data());
+    seqstart_q.ToDevice(seqstart_q_host.data());
+    seqstart_k.ToDevice(seqstart_k_host.data());
+    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
+    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
+    alibi_slope_buf.ToDevice(alibi_slope_host.data());
+
+    // clang-format off
+    auto layout_str = [&](bool permute){
+        if (permute) return std::string("bhsd");
+        else return std::string("bshd");
+    };
+    auto io_layout = [&](bool iperm_, bool operm_) {
+        if (iperm_ == operm_) return layout_str(iperm_);
+        else return layout_str(iperm_) + std::string("-") + layout_str(operm_);
+    };
+    // clang-format on
+
+    const std::size_t workspace_size_in_megabytes =
+        ck_tile::integer_divide_ceil(dq_acc_host.get_element_space_size_in_bytes(), 1024 * 1024);
+
+    std::cout << "[" << data_type << "|" << mode << "|" << io_layout(i_perm, o_perm)
+              << "] b:" << batch << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_qs[0]
+              << "/" << seqlen_ks[0] << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale
+              << ", bias:" << bias << ", dbias:" << use_dbias << ", p_drop:" << p_drop
+              << ", s_randval:" << s_randval << ", deterministic:" << deterministic
+              << (deterministic ? std::string(", workspace:") +
+                                      std::to_string(workspace_size_in_megabytes) + "MiB"
+                                : "")
+              << ", mask:" << mask << std::flush;
+
+    auto fmha_traits = fmha_bwd_traits{hdim_q,
+                                       hdim_v,
+                                       data_type,
+                                       mode == mode_enum::group,
+                                       mask.type,
+                                       bias.type,
+                                       use_dbias,
+                                       p_drop > 0.0f,
+                                       s_randval,
+                                       deterministic};
+    auto fmha_args   = [&]() {
+        /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
+        ///       seqlen_k] in this example, hence both the 'batch_stride_bias' &
+        ///       'nhead_stride_bias' are 0.
+        // setup stride_* arguments
+        const ck_tile::index_t stride_q       = (i_perm ? hdim_q : nhead * hdim_q);
+        const ck_tile::index_t stride_k       = (i_perm ? hdim_q : nhead_k * hdim_q);
+        const ck_tile::index_t stride_v       = (i_perm ? hdim_v : nhead_k * hdim_v);
+        const ck_tile::index_t stride_bias    = (max_seqlen_k);
+        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_randval = (max_seqlen_k);
+        const ck_tile::index_t stride_do      = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_dk      = (i_perm ? hdim_q : nhead * hdim_q);
+        const ck_tile::index_t stride_dv      = (i_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_dbias   = (i_perm ? max_seqlen_k : nhead * max_seqlen_k);
+        // setup nhead_stride_* arguments
+        const ck_tile::index_t nhead_stride_q       = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
+        const ck_tile::index_t nhead_stride_k       = (i_perm ? shape_seqlen_k * hdim_q : hdim_q);
+        const ck_tile::index_t nhead_stride_v       = (i_perm ? shape_seqlen_k * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_bias    = 0;
+        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t nhead_stride_do      = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_lsed    = shape_seqlen_q;
+        const ck_tile::index_t nhead_stride_dbias =
+            (i_perm ? shape_seqlen_q * max_seqlen_k : max_seqlen_k);
+        // setup batch_stride_* arguments
+        const ck_tile::index_t batch_stride_q       = (nhead * shape_seqlen_q * hdim_q);
+        const ck_tile::index_t batch_stride_k       = (nhead_k * shape_seqlen_k * hdim_q);
+        const ck_tile::index_t batch_stride_v       = (nhead_k * shape_seqlen_k * hdim_v);
+        const ck_tile::index_t batch_stride_bias    = 0;
+        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t batch_stride_do      = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_lsed    = (nhead * shape_seqlen_q);
+        const ck_tile::index_t batch_stride_dk      = (nhead * shape_seqlen_k * hdim_q);
+        const ck_tile::index_t batch_stride_dv      = (nhead * shape_seqlen_k * hdim_v);
+        const ck_tile::index_t batch_stride_dbias   = (nhead * shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t split_stride_dq_acc =
+            (shape_batch * nhead * shape_seqlen_q * hdim_q);
+
+        const auto drop_seed_offset = [&]() -> decltype(fmha_bwd_args::drop_seed_offset) {
+            if(drop_prefs)
+            {
+                return std::make_pair(drop_seed_buf.GetDeviceBuffer(),
+                                      drop_offset_buf.GetDeviceBuffer());
+            }
+            else
+            {
+                return std::make_pair(drop_seed, drop_offset);
+            }
+        }();
+
+        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
+                             k_buf.GetDeviceBuffer(),
+                             v_buf.GetDeviceBuffer(),
+                             bias.type == bias_enum::alibi ? alibi_slope_buf.GetDeviceBuffer()
+                                                             : bias_buf.GetDeviceBuffer(),
+                             o_buf.GetDeviceBuffer(),
+                             lse_buf.GetDeviceBuffer(),
+                             do_buf.GetDeviceBuffer(),
+                             d_buf.GetDeviceBuffer(),
+                             randval_buf.GetDeviceBuffer(),
+                             dq_buf.GetDeviceBuffer(),
+                             dk_buf.GetDeviceBuffer(),
+                             dv_buf.GetDeviceBuffer(),
+                             dbias_buf.GetDeviceBuffer(),
+                             dq_acc_buf.GetDeviceBuffer(),
+                             seqstart_q.GetDeviceBuffer(),
+                             seqstart_k.GetDeviceBuffer(),
+                             nullptr,
+                             shape_seqlen_q,
+                             shape_seqlen_k,
+                             batch,
+                             max_seqlen_q,
+                             max_seqlen_k,
+                             hdim_q,
+                             hdim_v,
+                             nhead,
+                             nhead_k,
+                             scale,
+                             stride_q,
+                             stride_k,
+                             stride_v,
+                             bias.type == bias_enum::alibi ? (bias.rank_info == 0 ? 0 : nhead)
+                                                             : stride_bias,
+                             stride_o,
+                             stride_randval,
+                             stride_do,
+                             stride_q, // stride_dq_acc
+                             stride_q, // stride_dq
+                             stride_dk,
+                             stride_dv,
+                             stride_dbias,
+                             nhead_stride_q,
+                             nhead_stride_k,
+                             nhead_stride_v,
+                             nhead_stride_bias,
+                             nhead_stride_o,
+                             nhead_stride_randval,
+                             nhead_stride_do,
+                             nhead_stride_lsed,
+                             nhead_stride_q, // nhead_stride_dq_acc
+                             nhead_stride_q, // nhead_stride_dq
+                             nhead_stride_k, // nhead_stride_dk
+                             nhead_stride_v, // nhead_stride_dv
+                             nhead_stride_dbias,
+                             batch_stride_q,
+                             batch_stride_k,
+                             batch_stride_v,
+                             batch_stride_bias,
+                             batch_stride_o,
+                             batch_stride_randval,
+                             batch_stride_do,
+                             batch_stride_lsed,
+                             batch_stride_q, // batch_stride_dq_acc
+                             batch_stride_q, // batch_stride_dq
+                             batch_stride_dk,
+                             batch_stride_dv,
+                             batch_stride_dbias,
+                             split_stride_dq_acc,
+                             mask.left,
+                             mask.right,
+                             static_cast<ck_tile::index_t>(mask.type),
+                             p_drop,
+                             p_undrop,
+                             drop_seed_offset};
+    }();
+
+    const float ave_time = fmha_bwd(fmha_traits, fmha_args, stream_config);
+    if(ave_time < 0)
+    {
+        std::cout << ", not supported yet" << std::flush << std::endl;
+        return bwd_result::no_instance;
+    }
+
+    const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    const float gb_per_sec = num_byte / 1.E6 / ave_time;
+    if(stream_config.time_kernel_)
+    {
+        std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
+                  << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2)
+                  << gb_per_sec << " GB/s" << std::flush;
+    }
+
+    bool pass = true;
+    if(!do_validation)
+    {
+        std::cout << std::flush << std::endl;
+    }
+    else
+    {
+        std::vector<ck_tile::HostTensor<QDataType>> q_host_refs;
+        std::vector<ck_tile::HostTensor<KDataType>> k_host_refs;
+        std::vector<ck_tile::HostTensor<VDataType>> v_host_refs;
+        std::vector<ck_tile::HostTensor<ODataType>> o_host_refs;
+        std::vector<ck_tile::HostTensor<RandValOutputDataType>> randval_host_refs;
+        std::vector<ck_tile::HostTensor<AccDataType>> p_hp_host_refs;
+        std::vector<ck_tile::HostTensor<GemmDataType>> p_lp_host_refs;
+
+        randval_buf.FromDevice(randval_host.data());
+
+        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+        {
+            const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
+            const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
+
+            // adjust matrix index according to the mode
+            const ck_tile::index_t b = (mode == mode_enum::batch ? wb : 0);
+            const ck_tile::index_t query_offset =
+                (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
+            const ck_tile::index_t key_offset =
+                (mode == mode_enum::batch ? 0 : seqstart_k_host[wb]);
+
+            ck_tile::HostTensor<QDataType> q_host_ref({nhead, real_seqlen_q, hdim_q}); // q_g_m_k
+            ck_tile::HostTensor<KDataType> k_host_ref({nhead, real_seqlen_k, hdim_q}); // k_g_n_k
+            ck_tile::HostTensor<VDataType> v_host_ref({nhead, hdim_v, real_seqlen_k}); // v_g_o_n
+            ck_tile::HostTensor<ODataType> o_host_ref({nhead, real_seqlen_q, hdim_v}); // o_g_m_o
+            ck_tile::HostTensor<LSEDataType> lse_host_ref({nhead, real_seqlen_q});     // lse_g_m
+            ck_tile::HostTensor<RandValOutputDataType> randval_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // randval_g_m_n
+            ck_tile::HostTensor<AccDataType> s_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // s_g_m_n
+            ck_tile::HostTensor<AccDataType> p_hp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // p_hp_g_m_n high precision
+            ck_tile::HostTensor<AccDataType> p_dropped_hp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // p_dropped_hp_g_m_n high precision
+            ck_tile::HostTensor<GemmDataType> p_lp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // p_lp_g_m_n low precision
+
+            ck_tile::index_t nr = nhead / nhead_k;
+
+            // clang-format off
+            // permute
+            if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[0], i[1] + query_offset, i[2]); });
+            else       q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[1] + query_offset, i[0], i[2]); });
+
+            if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[0] / nr, i[1] + key_offset, i[2]); });
+            else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[1] + key_offset, i[0] / nr, i[2]); });
+
+            // v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
+            if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[0] / nr, i[2] + key_offset, i[1]); });
+            // v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
+            else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[2] + key_offset, i[0] / nr, i[1]); });
+            // clang-format on
+
+            // reference
+            // S = scale * Q * K^T
+            ck_tile::reference_batched_gemm<QDataType, KDataType, AccDataType, AccDataType>(
+                q_host_ref,
+                k_host_ref,
+                s_host_ref,
+                ck_tile::identity{},
+                ck_tile::identity{},
+                ck_tile::scales(scale)); // s_g_m_n = scale * q_g_m_k@k_g_n_k
+
+            if(bias.type == bias_enum::elementwise_bias)
+            {
+                // elementwise bias
+                ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
+                // clang-format off
+                if(i_perm)
+                    bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
+                else
+                    bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
+                // clang-format on
+
+                // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
+                // real_seqlen_k]
+                ck_tile::reference_batched_elementwise<AccDataType,
+                                                       BiasDataType,
+                                                       AccDataType,
+                                                       AccDataType>(
+                    s_host_ref, bias_host_ref, s_host_ref);
+            }
+            else if(bias.type == bias_enum::alibi)
+            {
+                // alibi construct elementwise bias to verify
+                auto alibi_host = [&]() {
+                    if(mask.type != mask_enum::no_mask)
+                    {
+                        return ck_tile::make_alibi_from_lr_mask<AccDataType, false>(
+                            0,
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            static_cast<ck_tile::GenericAttentionMaskEnum>(mask.type));
+                    }
+                    else
+                    {
+                        return ck_tile::Alibi<AccDataType, false>{
+                            0, real_seqlen_q, real_seqlen_k, ck_tile::AlibiMode::FROM_BOTTOM_RIGHT};
+                    }
+                }();
+
+                ck_tile::HostTensor<AccDataType> alibi_bias_host_ref(
+                    {nhead, real_seqlen_q, real_seqlen_k});
+                auto i_b_slope = bias.rank_info == 0 ? 0 : wb;
+                for(auto i_h = 0; i_h < nhead; i_h++)
+                {
+                    AccDataType current_slope = alibi_slope_host(i_b_slope, i_h);
+                    alibi_host.slope          = alibi_host.mode == ck_tile::AlibiMode::VERTICAL
+                                                    ? current_slope
+                                                    : -current_slope;
+                    for(auto i_r = 0; i_r < real_seqlen_q; i_r++)
+                    {
+                        for(auto i_c = 0; i_c < real_seqlen_k; i_c++)
+                        {
+                            AccDataType pixel = 0;
+                            alibi_host.update(pixel, i_r, i_c);
+                            alibi_bias_host_ref(i_h, i_r, i_c) = pixel;
+                        }
+                    }
+                }
+                // [nhead, real_seqlen_q, real_seqlen_k]
+                ck_tile::reference_batched_elementwise<AccDataType,
+                                                       AccDataType,
+                                                       AccDataType,
+                                                       AccDataType>(
+                    s_host_ref, alibi_bias_host_ref, s_host_ref);
+            }
+
+            if(mask.type == mask_enum::no_mask)
+            {
+                ck_tile::reference_batched_masking<AccDataType>(
+                    s_host_ref, FmhaMasks::NoMask{real_seqlen_q, real_seqlen_k});
+            }
+            else if(mask.type == mask_enum::window_generic)
+            {
+                ck_tile::reference_batched_masking<AccDataType>(
+                    s_host_ref,
+                    ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
+                        mask.left, mask.right, real_seqlen_q, real_seqlen_k));
+            }
+            else
+            {
+                // if left window size is negative, means causal
+                // else means generic (for current batch)
+                if(mask.left < 0)
+                    ck_tile::reference_batched_masking<AccDataType>(
+                        s_host_ref,
+                        ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::CausalMask>(
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            mask.type == mask_enum::mask_top_left));
+                else
+                    ck_tile::reference_batched_masking<AccDataType>(
+                        s_host_ref,
+                        ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            mask.type == mask_enum::mask_top_left));
+            }
+            const ck_tile::HostTensor<AccDataType> masked_s_host_ref = s_host_ref;
+            ck_tile::reference_batched_softmax<AccDataType, LSEDataType, AccDataType>(
+                s_host_ref, p_hp_host_ref, ck_tile::identity{}, lse_host_ref);
+
+            if(p_drop > 0)
+            {
+                p_dropped_hp_host_ref = p_hp_host_ref;
+                ck_tile::reference_batched_dropout_randval(
+                    randval_host_ref, wb, drop_seed, drop_offset);
+                ck_tile::reference_batched_dropout(
+                    p_dropped_hp_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
+                p_lp_host_ref = p_dropped_hp_host_ref.template CopyAsType<GemmDataType>();
+
+                ck_tile::HostTensor<RandValOutputDataType> randval_host_result(
+                    {nhead, real_seqlen_q, real_seqlen_k});
+                randval_host_result.ForEach([&](auto& self, const auto& idx) {
+                    self(idx) = randval_host(b, idx[0], idx[1] + query_offset, idx[2]);
+                });
+                masked_s_host_ref.ForEach([&](const auto& self, const auto& idx) {
+                    // Ignore all masked values in validation check
+                    if(std::isinf(self(idx)))
+                    {
+                        randval_host_ref(idx)    = 0;
+                        randval_host_result(idx) = 0;
+                    }
+                });
+                bool cur_pass = ck_tile::check_err(randval_host_result,
+                                                   randval_host_ref,
+                                                   "DROPOUT RANDVAL Error: Incorrect results!");
+                pass &= cur_pass;
+                if(!cur_pass)
+                {
+                    break;
+                }
+            }
+            else
+            {
+                p_lp_host_ref = p_hp_host_ref.template CopyAsType<GemmDataType>();
+            }
+
+            // O = P * V
+            ck_tile::reference_batched_gemm<GemmDataType, VDataType, AccDataType, ODataType>(
+                p_lp_host_ref, v_host_ref, o_host_ref); // o_g_m_o = p_lp_g_m_n@v_g_o_n
+
+            // clang-format off
+            // permute
+            if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[0], idx[1] + query_offset, idx[2]) = self(idx); });
+            else       o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[1] + query_offset, idx[0], idx[2]) = self(idx); });
+
+            lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(b, idx[0], idx[1] + query_offset) = self(idx); });
+            // clang-format on
+
+            q_host_refs.push_back(q_host_ref);
+            k_host_refs.push_back(k_host_ref);
+            v_host_refs.push_back(v_host_ref);
+            o_host_refs.push_back(o_host_ref);
+            p_hp_host_refs.push_back(p_hp_host_ref);
+            p_lp_host_refs.push_back(p_lp_host_ref);
+            if(p_drop > 0)
+            {
+                randval_host_refs.push_back(randval_host_ref);
+            }
+        }
+
+        // set to bad values to check if the kernel writes to these buffers
+        ck_tile::FillConstant<QGradDataType>{ck_tile::numeric<QGradDataType>::infinity()}(dq_host);
+        ck_tile::FillConstant<KGradDataType>{ck_tile::numeric<KGradDataType>::infinity()}(dk_host);
+        ck_tile::FillConstant<VGradDataType>{ck_tile::numeric<VGradDataType>::infinity()}(dv_host);
+        ck_tile::FillConstant<AccDataType>{ck_tile::numeric<AccDataType>::infinity()}(dq_acc_host);
+        dq_buf.ToDevice(dq_host.data());
+        dk_buf.ToDevice(dk_host.data());
+        dv_buf.ToDevice(dv_host.data());
+        dq_acc_buf.ToDevice(dq_acc_host.data());
+
+        o_buf.ToDevice(o_host.data());
+        lse_buf.ToDevice(lse_host.data());
+        dbias_buf.SetZero();
+
+        // non-deterministic kernels use atomic add to write dq
+        // Some block may be skipped with causal mask and dq are not set to zeros
+        // In these cases thus we need to zero out it first
+        if(!deterministic || mask.type != mask_enum::no_mask)
+            dq_acc_buf.SetZero();
+
+        ck_tile::stream_config stream_config_v{nullptr, true, 0, 0, 1};
+        fmha_bwd(fmha_traits, fmha_args, stream_config_v);
+
+        dq_buf.FromDevice(dq_host.data());
+        dk_buf.FromDevice(dk_host.data());
+        dv_buf.FromDevice(dv_host.data());
+        dbias_buf.FromDevice(dbias_host.data());
+
+        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+        {
+            const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
+            const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
+
+            // adjust matrix index according to the mode
+            const ck_tile::index_t b = (mode == mode_enum::batch ? wb : 0);
+            const ck_tile::index_t query_offset =
+                (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
+            const ck_tile::index_t key_offset =
+                (mode == mode_enum::batch ? 0 : seqstart_k_host[wb]);
+
+            ck_tile::HostTensor<OGradDataType> do_host_ref(
+                {nhead, real_seqlen_q, hdim_v}); // do_g_m_o
+            ck_tile::HostTensor<AccDataType> ds_hp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // ds_g_m_n high precision
+            ck_tile::HostTensor<GemmDataType> ds_lp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // ds_g_m_n low precision
+            ck_tile::HostTensor<AccDataType> dp_hp_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // dp_g_m_n high precision
+            ck_tile::HostTensor<BiasGradDataType> dbias_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // dbias_g_m_n
+            ck_tile::HostTensor<QGradDataType> dq_host_ref(
+                {nhead, real_seqlen_q, hdim_q}); // dq_g_m_k
+            ck_tile::HostTensor<KGradDataType> dk_host_ref(
+                {nhead, real_seqlen_k, hdim_q}); // dk_g_n_k
+            ck_tile::HostTensor<VGradDataType> dv_host_ref(
+                {nhead, real_seqlen_k, hdim_v}); // dv_g_n_o
+
+            // clang-format off
+            if(o_perm) do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[0], i[1] + query_offset, i[2]); });
+            else       do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[1] + query_offset, i[0], i[2]); });
+            // clang-format on
+
+            // dP = dO@V x Z w/  dropout
+            // dP = dO@V     w/o dropout
+            auto v_t_host_ref = v_host_refs[wb].transpose({0, 2, 1}); // v_g_o_n -> v_g_n_o
+            ck_tile::reference_batched_gemm<OGradDataType, VDataType, AccDataType, AccDataType>(
+                do_host_ref, v_t_host_ref, dp_hp_host_ref); // dp_g_m_n = do_g_m_o@v_g_n_o
+
+            if(p_drop > 0)
+            {
+                ck_tile::reference_batched_dropout(
+                    dp_hp_host_ref, randval_host_refs[wb], p_undrop_in_uint8_t, rp_undrop);
+            }
+
+            // dS_i_j = P_i_j .* (dP_i_j - dO_i dot O_i)
+            ck_tile::make_ParallelTensorFunctor(
+                [&](auto i0, auto i1, auto i2) {
+                    AccDataType do_dot_o = 0;
+                    for(int o = 0; o < hdim_v; o++)
+                    {
+                        do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(i0, i1, o)) *
+                                    ck_tile::type_convert<AccDataType>(o_host_refs[wb](i0, i1, o));
+                    }
+                    ds_hp_host_ref(i0, i1, i2) = ck_tile::type_convert<AccDataType>(
+                        p_hp_host_refs[wb](i0, i1, i2) * (dp_hp_host_ref(i0, i1, i2) - do_dot_o));
+                },
+                ds_hp_host_ref.mDesc.get_lengths()[0],
+                ds_hp_host_ref.mDesc.get_lengths()[1],
+                ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency());
+
+            if(use_dbias)
+            {
+                dbias_host_ref = ds_hp_host_ref.template CopyAsType<BiasGradDataType>();
+            }
+
+            ds_lp_host_ref = ds_hp_host_ref.template CopyAsType<GemmDataType>();
+
+            // dV = P_drop^T@dO^T
+            // dV = P^T@dO^T w/o dropout
+            auto p_t_lp_host_ref =
+                p_lp_host_refs[wb].transpose({0, 2, 1});           // p_lp_g_m_n -> p_lp_g_n_m
+            auto do_t_host_ref = do_host_ref.transpose({0, 2, 1}); // do_g_m_o -> do_g_o_m
+            ck_tile::
+                reference_batched_gemm<GemmDataType, OGradDataType, AccDataType, VGradDataType>(
+                    p_t_lp_host_ref, do_t_host_ref, dv_host_ref); // dv_g_n_o = p_lp_g_n_m@do_g_o_m
+
+            // dQ = scale * dS@K^T
+            auto k_t_host_ref = k_host_refs[wb].transpose({0, 2, 1}); // k_g_n_k -> k_g_k_n
+            ck_tile::reference_batched_gemm<GemmDataType, KDataType, AccDataType, QGradDataType>(
+                ds_lp_host_ref,
+                k_t_host_ref,
+                dq_host_ref,
+                ck_tile::identity{},
+                ck_tile::identity{},
+                ck_tile::scales(scale)); // dq_g_m_k = ds_g_m_n@k_g_k_n
+
+            // dK = scale * dS^T@Q^T
+            auto ds_t_lp_host_ref = ds_lp_host_ref.transpose({0, 2, 1});  // ds_g_m_n -> ds_g_n_m
+            auto q_t_host_ref     = q_host_refs[wb].transpose({0, 2, 1}); // q_g_m_k -> q_g_k_m
+            ck_tile::reference_batched_gemm<GemmDataType, QDataType, AccDataType, KGradDataType>(
+                ds_t_lp_host_ref,
+                q_t_host_ref,
+                dk_host_ref,
+                ck_tile::identity{},
+                ck_tile::identity{},
+                ck_tile::scales(scale)); // dk_g_n_k = ds_g_n_m@q_g_k_m
+
+            ck_tile::HostTensor<QGradDataType> dq_host_result(
+                {nhead, real_seqlen_q, hdim_q}); // dq_g_m_k
+            ck_tile::HostTensor<KGradDataType> dk_host_result(
+                {nhead, real_seqlen_k, hdim_q}); // dk_g_n_k
+            ck_tile::HostTensor<VGradDataType> dv_host_result(
+                {nhead, real_seqlen_k, hdim_v}); // dv_g_n_o
+            ck_tile::HostTensor<BiasGradDataType> dbias_host_result(
+                {nhead, real_seqlen_q, real_seqlen_k}); // dbias_g_m_n
+
+            // clang-format off
+            // permute
+            if(i_perm) dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[0], idx[1] + query_offset, idx[2]); });
+            else       dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[1] + query_offset, idx[0], idx[2]); });
+
+            if(i_perm) dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[0], idx[1] + key_offset, idx[2]); });
+            else       dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[1] + key_offset, idx[0], idx[2]); });
+
+            if(i_perm) dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[0], idx[1] + key_offset, idx[2]); });
+            else       dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[1] + key_offset, idx[0], idx[2]); });
+
+            if(use_dbias)
+            {
+                if(i_perm) dbias_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dbias_host(b, idx[0], idx[1] + query_offset, idx[2]); });
+                else       dbias_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dbias_host(b, idx[1] + query_offset, idx[0], idx[2]); });
+            }
+            // clang-format on
+
+            auto [rtol, atol] = get_elimit<DataTypeConfig>(hdim_q, hdim_v);
+            bool dq_cur_pass  = ck_tile::check_err(dq_host_result,
+                                                  dq_host_ref,
+                                                  std::string("Error: QGrad Incorrect results!"),
+                                                  rtol,
+                                                  atol);
+            bool dk_cur_pass  = ck_tile::check_err(dk_host_result,
+                                                  dk_host_ref,
+                                                  std::string("Error: KGrad Incorrect results!"),
+                                                  rtol,
+                                                  atol);
+            bool dv_cur_pass  = ck_tile::check_err(dv_host_result,
+                                                  dv_host_ref,
+                                                  std::string("Error: VGrad Incorrect results!"),
+                                                  rtol,
+                                                  atol);
+
+            bool dbias_cur_pass = true;
+            if(use_dbias)
+            {
+                dbias_cur_pass =
+                    ck_tile::check_err(dbias_host_result,
+                                       dbias_host_ref,
+                                       std::string("Error: BiasGrad Incorrect results!"),
+                                       rtol,
+                                       atol);
+            }
+            pass &= (dq_cur_pass & dk_cur_pass & dv_cur_pass & dbias_cur_pass);
+            if(!(dq_cur_pass & dk_cur_pass & dv_cur_pass & dbias_cur_pass))
+            {
+                std::cerr << "mismatch found at batch: " << wb << std::endl
+                          << "\tseqlen_q: " << real_seqlen_q << std::endl
+                          << "\tseqlen_k: " << real_seqlen_k << std::endl
+                          << "\tseqstart_q: " << seqstart_q_host << std::endl
+                          << "\tseqstart_k: " << seqstart_k_host << std::endl;
+
+                break;
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    if(json)
+    {
+        dump_fmha_bwd_json_results(
+            *json,
+            data_type,
+            mode == mode_enum::batch ? "batch" : "group",
+            i_perm ? "true" : "false",
+            o_perm ? "true" : "false",
+            batch,
+            nhead,
+            nhead_k,
+            seqlen_qs[0],
+            seqlen_ks[0],
+            hdim_q,
+            hdim_v,
+            scale,
+            bias.type == bias_enum::elementwise_bias
+                ? "elementwise_bias"
+                : (bias.type == bias_enum::alibi ? "alibi" : "no_bias"),
+            use_dbias ? "true" : "false",
+            p_drop,
+            s_randval,
+            deterministic,
+            mask.type == mask_enum::no_mask
+                ? "no_mask"
+                : (mask.type == mask_enum::window_generic
+                       ? "window_generic"
+                       : (mask.type == mask_enum::mask_top_left
+                              ? "mask_top_left"
+                              : (mask.type == mask_enum::mask_bottom_right ? "mask_bottom_right"
+                                                                           : "mask_generic"))),
+            mask.left,
+            mask.right,
+            workspace_size_in_megabytes,
+            pass,
+            ave_time,
+            tflops,
+            gb_per_sec);
+    }
+
+    return pass ? bwd_result::success : bwd_result::failure;
+}
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index df1e9e5699..761def6d6a 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -17,6 +17,10 @@
 #include <utility>
 #include <variant>
 
+struct FmhaFwdFp32
+{
+};
+
 struct FmhaFwdFp16
 {
 };
@@ -41,9 +45,29 @@ struct FmhaFwdFp8Bf16
 {
 };
 
+struct FmhaFwdFp8Fp32
+{
+};
+
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 
+template <>
+struct FmhaFwdTypeConfig<FmhaFwdFp32>
+{
+    using QDataType             = float;
+    using KDataType             = float;
+    using VDataType             = float;
+    using BiasDataType          = float;
+    using RandValOutputDataType = uint8_t;
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using SaccDataType          = float; // data type for first gemm accumulation
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
+    using PDataType             = float; // data type for A matrix of second gemm
+    using OaccDataType          = float; // data type for second gemm accumulation
+    using ODataType             = float;
+};
+
 template <>
 struct FmhaFwdTypeConfig<FmhaFwdFp16>
 {
@@ -108,6 +132,38 @@ struct FmhaFwdTypeConfig<FmhaFwdBf8>
     using ODataType             = ck_tile::bf8_t;
 };
 
+template <>
+struct FmhaFwdTypeConfig<FmhaFwdFp8Bf16>
+{
+    using QDataType             = ck_tile::fp8_t;
+    using KDataType             = ck_tile::fp8_t;
+    using VDataType             = ck_tile::fp8_t;
+    using BiasDataType          = float;
+    using RandValOutputDataType = uint8_t;
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using SaccDataType          = float; // data type for first gemm accumulation
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
+    using PDataType             = ck_tile::fp8_t; // data type for A matrix of second gemm
+    using OaccDataType          = float;          // data type for second gemm accumulation
+    using ODataType             = ck_tile::bf16_t;
+};
+
+template <>
+struct FmhaFwdTypeConfig<FmhaFwdFp8Fp32>
+{
+    using QDataType             = ck_tile::fp8_t;
+    using KDataType             = ck_tile::fp8_t;
+    using VDataType             = ck_tile::fp8_t;
+    using BiasDataType          = float;
+    using RandValOutputDataType = uint8_t;
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using SaccDataType          = float; // data type for first gemm accumulation
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
+    using PDataType             = ck_tile::fp8_t; // data type for A matrix of second gemm
+    using OaccDataType          = float;          // data type for second gemm accumulation
+    using ODataType             = float;
+};
+
 struct FmhaMasks
 {
     using NoMask      = ck_tile::GenericAttentionMask<false>;
@@ -126,11 +182,20 @@ struct fmha_fwd_args
     void* lse_ptr;
     void* o_ptr;
 
+    // Optional cumulative sequence length arrays
+    // Batch mode: cu_seqlen_* override effective per-batch lengths (exclude PAD)
+    const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr; // [batch+1]
+    const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr; // [batch+1]
+
     const void* seqstart_q_ptr;
     const void* seqstart_k_ptr;
     const void*
         seqlen_k_ptr; // only used if both 'seqstart_q_ptr' & 'seqstart_k_ptr' are not nullptr
 
+    // Group mode: seqstart_padded_* provide physical starts including PAD (optional)
+    const void* seqstart_padded_q_ptr = nullptr; // [batch+1]
+    const void* seqstart_padded_k_ptr = nullptr; // [batch+1]
+
     ck_tile::index_t seqlen_q;
     ck_tile::index_t seqlen_k;
     ck_tile::index_t batch;
@@ -518,7 +583,9 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                              args.min_seqlen_q,
                                              args.p_drop,
                                              args.s_randval,
-                                             args.drop_seed_offset);
+                                             args.drop_seed_offset,
+                                             args.seqstart_padded_q_ptr,
+                                             args.seqstart_padded_k_ptr);
         }
         else
         { // create batch mode kernel arguments
@@ -564,7 +631,9 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                              args.mask_type,
                                              args.p_drop,
                                              args.s_randval,
-                                             args.drop_seed_offset);
+                                             args.drop_seed_offset,
+                                             args.cu_seqlen_q_ptr,
+                                             args.cu_seqlen_kv_ptr);
         }
     }();
 
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
similarity index 50%
rename from example/ck_tile/01_fmha/fmha_fwd.cpp
rename to example/ck_tile/01_fmha/fmha_fwd_runner.hpp
index d0f8e3798c..0703af71e3 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
@@ -1,12 +1,13 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "fmha_fwd.hpp"
+#pragma once
+
 #include "ck_tile/host.hpp"
 #include "ck_tile/ref/naive_attention.hpp"
-#include "mask.hpp"
-#include "rotary.hpp"
+#include "fmha_fwd.hpp"
 #include "utils.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #include <array>
 #include <cstring>
@@ -23,126 +24,13 @@
 #error "we should enable fmha_fwd_splitkv() api in order to cooperate with fmha_fwd_appendkv()"
 #endif
 
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+enum class fwd_result
 {
-    using size_type = typename std::vector<T>::size_type;
-
-    os << "[";
-    for(size_type idx = 0; idx < v.size(); ++idx)
-    {
-        if(0 < idx)
-        {
-            os << ", ";
-        }
-        os << v[idx];
-    }
-    return os << "]";
-}
-
-auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "0:no validation, 2:cpu validation, 2:gpu validation(experimental)")
-        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
-        .insert("b", "2", "batch size")
-        .insert("h", "8", "num of head, for q")
-        .insert("h_k",
-                "-1",
-                "num of head, for k/v, -1 means equal to h\n"
-                "if not equal to h, then this is GQA/MQA case")
-        .insert(
-            "s",
-            "3328",
-            "seqlen_q. if group-mode, means the average value of seqlen_q\n"
-            "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary\n"
-            "also with \"-s=s0,s1,s2...\" comma seperated int to set per batch seqlen(group-mode)")
-        .insert("s_k", "-1", "seqlen_k (including new key/value), -1 means equal to s")
-        .insert("s_knew",
-                "0",
-                "seqlen_k for new key/value, 0 means not to use this at all; "
-                "-1 to choose s_knew in [1, s] randomly.")
-        .insert("s_kpad",
-                "-1",
-                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
-                "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n"
-                "along seqlen, instead of packed. same as xformer kv_padding")
-        .insert("d", "128", "head dim for q, k")
-        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
-        .insert("scale_s",
-                "0",
-                "scale factor of S. 0 means equal to 1/sqrt(hdim).\n"
-                "note when squant=1, this value will be modified by range_q/k")
-        .insert("logits_soft_cap", "0", "attention logits soft capping value.")
-        .insert("range_q", "16", "per-tensor quantization range of q. used if squant=1.")
-        .insert("range_k", "16", "per-tensor quantization range of k. used if squant=1.")
-        .insert("range_v", "16", "per-tensor quantization range of v. used if squant=1.")
-        .insert("range_p", "1", "per-tensor quantization range of p [e^(s-m)]. used if squant=1.")
-        .insert("range_o", "16", "per-tensor quantization range of o (p*v). used if squant=1.")
-        .insert("squant",
-                "auto",
-                "if using static quantization fusion or not. auto: fp8 will default use squant, "
-                "other will not\n"
-                "0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to "
-                "P and O.\n"
-                "calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, "
-                "range_p, range_o")
-        .insert("iperm",
-                "1",
-                "permute input\n"
-                "if true, will be b*h*s*d, else b*s*h*d")
-        .insert("operm", "1", "permute output")
-        .insert("bias",
-                "n",
-                "n or 0, no bias\n"
-                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
-                "a(libi) or 2, alibi with 1*h. a:1, b*h")
-        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
-        .insert("mask",
-                "0",
-                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
-                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
-                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
-                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
-                "'xt:window_size', xformer style masking from top-left, window_size negative is "
-                "causal, positive is swa\n"
-                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
-                "causal, positive is swa\n"
-                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
-                "now)")
-        .insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
-        .insert("lse", "0", "0 not store lse, 1 store lse")
-        .insert("kname", "0", "if set to 1 will print kernel name")
-        .insert("init",
-                "uf",
-                "init method. ui, uniform random int, ni, normalized random int\n"
-                "uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, "
-                "quantization")
-        .insert("seed",
-                "11939",
-                "random seed used for initializing input tensors. 0 for "
-                "non-deterministic seed")
-        .insert("p_drop", "0", "0~1 probability of dropout")
-        .insert("drop_seed", "1", "seed for random number generator")
-        .insert("drop_offset", "0", "offset for random number generator")
-        .insert("drop_prefs",
-                "0",
-                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert(
-            "rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
-        .insert("rotary_interleaved", "1", "whether to apply interleaved RoPE")
-        .insert("num_splits",
-                "1",
-                "# of splits for key/value. 0 to determine actual number by heuristic")
-        .insert("page_block_size", "0", "paged-kvcache block size. 0 means not use paged-kvcahe")
-        .insert("cache_batch_idx", "0", "whether to use index map to the kvcache")
-        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
+    success,
+    failure,
+    invalid_args,
+    no_instance,
+};
 
 // different threshold for different dtype
 template <typename DataTypeConfig>
@@ -153,6 +41,14 @@ auto get_elimit(std::string /*init_method*/)
     return ck_tile::make_tuple(rtol, atol);
 }
 
+template <>
+auto get_elimit<FmhaFwdFp32>(std::string /*init_method*/)
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
 template <>
 auto get_elimit<FmhaFwdBf16>(std::string /*init_method*/)
 {
@@ -162,20 +58,30 @@ auto get_elimit<FmhaFwdBf16>(std::string /*init_method*/)
 }
 
 template <>
-auto get_elimit<FmhaFwdFp8>(std::string init_method)
+auto get_elimit<FmhaFwdFp8>(std::string /*init_method*/)
 {
-    if(init_method == "ui" || init_method == "ni")
-    {
-        unsigned max_rounding_point_distance = 0;
-        double atol                          = 2e-3;
-        return ck_tile::make_tuple(max_rounding_point_distance, atol);
-    }
-    else
-    {
-        unsigned max_rounding_point_distance = 1;
-        double atol                          = 0.0625;
-        return ck_tile::make_tuple(max_rounding_point_distance, atol);
-    }
+    using TypeConfig  = FmhaFwdTypeConfig<FmhaFwdFp8>;
+    using ODataType   = typename TypeConfig::ODataType;
+    float o_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<ODataType>::max());
+    double rtol       = 0;
+    double atol       = 16 * (o_dtype_max > 240 ? 2 : 1);
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<FmhaFwdFp8Bf16>(std::string /*init_method*/)
+{
+    double rtol = 1e-2;
+    double atol = 1.8e-1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<FmhaFwdFp8Fp32>(std::string /*init_method*/)
+{
+    double rtol = 1e-2;
+    double atol = 1.8e-1;
+    return ck_tile::make_tuple(rtol, atol);
 }
 
 int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int max_splits)
@@ -244,35 +150,76 @@ int override_num_splits_if_necessary(
 }
 
 template <typename DataTypeConfig>
-bool run(const ck_tile::ArgParser& arg_parser)
+fwd_result fmha_fwd_run(mode_enum mode,
+                        ck_tile::index_t batch,
+                        ck_tile::index_t nhead,
+                        ck_tile::index_t nhead_k,
+                        std::vector<ck_tile::index_t> seqlen_qs,
+                        std::vector<ck_tile::index_t> seqlen_ks,
+                        ck_tile::index_t hdim_q,
+                        ck_tile::index_t hdim_v,
+                        ck_tile::index_t seqlen_knew,
+                        std::vector<ck_tile::index_t> seqlen_qpads,
+                        std::vector<ck_tile::index_t> seqlen_kpads,
+                        std::vector<ck_tile::index_t> q_eff_lens_per_batch,
+                        std::vector<ck_tile::index_t> kv_eff_lens_per_batch,
+                        ck_tile::index_t rotary_dim,
+                        bool i_perm,
+                        bool o_perm,
+                        float scale_s,
+                        float logits_soft_cap,
+                        bool is_v_rowmajor,
+                        bool lse,
+                        ck_tile::index_t page_block_size,
+                        bool use_cache_batch_idx,
+                        std::string bias_str,
+                        float p_drop,
+                        uint64_t drop_seed,
+                        uint64_t drop_offset,
+                        bool drop_prefs,
+                        std::string mask_str,
+                        bool squant,
+                        bool is_rotary_interleaved,
+                        ck_tile::index_t num_splits,
+                        std::string init_method,
+                        uint32_t seed,
+                        int do_validation,
+                        const ck_tile::stream_config& stream_config,
+                        std::optional<std::string> json = std::nullopt)
 {
-    std::string data_type    = arg_parser.get_str("prec");
-    int do_validation        = arg_parser.get_int("v");
-    auto mode                = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
-    ck_tile::index_t batch   = arg_parser.get_int("b");
-    ck_tile::index_t nhead   = arg_parser.get_int("h");
-    ck_tile::index_t nhead_k = arg_parser.get_int("h_k");
+    const std::string data_type = []() {
+        if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp32>)
+            return "fp32";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp16>)
+            return "fp16";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdBf16>)
+            return "bf16";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp8>)
+            return "fp8";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdBf8>)
+            return "bf8";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp8Bf16>)
+            return "fp8bf16";
+        else if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp8Fp32>)
+            return "fp8fp32";
+        else
+            static_assert(false);
+    }();
+
     if(nhead_k < 0)
         nhead_k = nhead;
-
     if(nhead % nhead_k != 0)
     {
         std::cerr << "nhead:" << nhead << " must be multiple of nhead_k:" << nhead_k << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
 
-    std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
-    if(*seed == 0)
-    {
-        seed.reset();
-    }
+    std::mt19937 random_engine(seed != 0 ? seed : std::random_device{}());
+    auto next_seed = [&random_engine]() { return static_cast<unsigned int>(random_engine()); };
 
-    ck_tile::index_t hdim_q = arg_parser.get_int("d");
-    ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
     if(hdim_v < 0)
         hdim_v = hdim_q;
 
-    ck_tile::index_t seqlen_knew = arg_parser.get_int("s_knew");
 #if !CK_TILE_FMHA_FWD_APPENDKV_API
     if(seqlen_knew != 0)
     {
@@ -283,17 +230,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
 #endif
     if(seqlen_knew < 0)
     {
-        seqlen_knew = randint<ck_tile::index_t>(1, arg_parser.get_int("s"), seed);
+        seqlen_knew = randint<ck_tile::index_t>(1, seqlen_qs[0], random_engine);
     }
 
-    ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim");
     if constexpr(!(std::is_same_v<DataTypeConfig, FmhaFwdFp16> ||
                    std::is_same_v<DataTypeConfig, FmhaFwdBf16>))
     {
         if(0 < rotary_dim)
         {
             std::cerr << "rotary embedding is only available for data type=fp16|bf16" << std::endl;
-            return false;
+            return fwd_result::invalid_args;
         }
     }
 #if !CK_TILE_FMHA_FWD_APPENDKV_API
@@ -314,15 +260,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(!(rotary_dim <= hdim_q))
     {
         std::cerr << "rotary_dim should be less than or equal to head dim for q" << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
     else if(!(rotary_dim % 16 == 0))
     {
         std::cerr << "only rotary dimensions divisible by 16 are currently supported" << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
 
-    ck_tile::index_t page_block_size = arg_parser.get_int("page_block_size");
 #if(!(CK_TILE_FMHA_FWD_APPENDKV_API || CK_TILE_FMHA_FWD_SPLITKV_API || \
       CK_TILE_FMHA_FWD_PAGEDKV_API))
     if(0 < page_block_size)
@@ -336,10 +281,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
     {
         std::cerr << "only paged-kvcache block size divisible by 128 are currently supported"
                   << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
 
-    bool use_cache_batch_idx = arg_parser.get_bool("cache_batch_idx");
 #if !(CK_TILE_FMHA_FWD_APPENDKV_API || CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API)
     if(use_cache_batch_idx)
     {
@@ -368,14 +312,41 @@ bool run(const ck_tile::ArgParser& arg_parser)
 #endif
     const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size);
 
-    auto [seqlen_qs, seqlen_ks, seqlen_kpads] =
-        decode_seqlen(mode,
-                      batch,
-                      arg_parser.get_str("s"),
-                      arg_parser.get_str("s_k"),
-                      arg_parser.get_str("s_kpad"),
-                      /*seqlen_k_min=*/0 < seqlen_knew ? seqlen_knew : 0,
-                      need_append_kvcache);
+    // Reject unsupported padding usage in special pipelines (appendkv / splitkv / pagedkv)
+    const bool has_group_padding =
+        (mode == mode_enum::group && (!seqlen_qpads.empty() && seqlen_qpads[0] != -1)) ||
+        (mode == mode_enum::group && (seqlen_kpads[0] >= 0));
+    const bool has_batch_efflens = (mode == mode_enum::batch && (!q_eff_lens_per_batch.empty() ||
+                                                                 !kv_eff_lens_per_batch.empty()));
+    const bool using_appendkv    = (0 < seqlen_knew || 0 < rotary_dim);
+    const bool using_pagedkv     = (0 < page_block_size);
+    const bool using_splitkv     = (num_splits > 1) || use_cache_batch_idx;
+    if((using_appendkv || using_pagedkv || using_splitkv) &&
+       (has_group_padding || has_batch_efflens))
+    {
+        std::cerr << "Padding (physical or effective lengths) is not supported with "
+                     "appendkv/splitkv/pagedkv pipelines"
+                  << std::endl;
+        return fwd_result::invalid_args;
+    }
+
+    std::tie(seqlen_qs, seqlen_ks, seqlen_kpads) =
+        generate_missing_seqlens(mode,
+                                 batch,
+                                 seqlen_qs,
+                                 seqlen_ks,
+                                 seqlen_kpads,
+                                 /*seqlen_k_min=*/0 < seqlen_knew ? seqlen_knew : 0,
+                                 need_append_kvcache,
+                                 random_engine);
+    for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+    {
+        if(seqlen_kpads[wb] > 0 && seqlen_kpads[wb] < seqlen_ks[wb])
+        {
+            std::cerr << "kpad must be greater than or equal to seqlen for k" << std::endl;
+            return fwd_result::invalid_args;
+        }
+    }
     // compute kvcache seqlen_k (before appending knew/vnew)
     auto cache_seqlen_ks = seqlen_ks;
     std::transform(cache_seqlen_ks.begin(),
@@ -384,64 +355,32 @@ bool run(const ck_tile::ArgParser& arg_parser)
                    [&](auto seqlen_k) { return seqlen_k - seqlen_knew; });
 
 #if 0
-    // clang-format off
-    std::cout << "seqlen_qs:"; for(auto xx : seqlen_qs) { std::cout << xx << ","; } std::cout << std::endl;
-    std::cout << "seqlen_ks:"; for(auto xx : seqlen_ks) { std::cout << xx << ","; } std::cout << std::endl;
-    std::cout << "seqlen_kpads:"; for(auto xx : seqlen_kpads) { std::cout << xx << ","; } std::cout << std::endl;
-    // clang-format on
+    std::cout << "seqlen_qs: " << seqlen_qs << std::endl;
+    std::cout << "seqlen_ks: " << seqlen_ks << std::endl;
+    std::cout << "seqlen_kpads: " << seqlen_kpads << std::endl;
+    std::cout << "cache_seqlen_ks: " << cache_seqlen_ks << std::endl;
 #endif
 
-    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
-    bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim
-
-    float scale_s = arg_parser.get_float("scale_s");
     if(scale_s == .0f)
         scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
 
-    const float logits_soft_cap = arg_parser.get_float("logits_soft_cap");
+    bias_info bias = bias_info::decode(bias_str);
 
-    std::string squant_str = arg_parser.get_str("squant");
-    bool squant            = [&]() {
-        if(squant_str == "auto")
-        {
-            if(data_type == "fp8")
-                return true;
-            else
-                return false;
-        }
-        else
-            return atoi(squant_str.c_str()) != 0 ? true : false;
-    }();
-
-    std::string vlayout = arg_parser.get_str("vlayout");
-    bool lse            = arg_parser.get_bool("lse");
-
-    bias_info bias = bias_info::decode(arg_parser.get_str("bias"));
-    mask_info mask = mask_info::decode(
-        arg_parser.get_str("mask"), seqlen_qs[0], seqlen_ks[0]); // TODO: we don't need x/y anymore
-
-    float p_drop         = arg_parser.get_float("p_drop");
-    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
-    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
-    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
+    mask_info mask =
+        mask_info::decode(mask_str, seqlen_qs[0], seqlen_ks[0]); // TODO: we don't need x/y anymore
 
     if(p_drop < 0.0f || p_drop > 1.0f)
     {
         std::cerr << "The value of p_drop should be 0~1" << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
 
     bool s_randval = false;
-    if(p_drop > 0.0f && do_validation != 0)
+    if(p_drop > 0.0f && do_validation)
     {
         s_randval = true;
     }
 
-    std::string init_method = arg_parser.get_str("init");
-
-    const bool is_rotary_interleaved = arg_parser.get_bool("rotary_interleaved");
-
-    ck_tile::index_t num_splits = arg_parser.get_int("num_splits");
 #if !CK_TILE_FMHA_FWD_SPLITKV_API
     if(num_splits != 1)
     {
@@ -450,21 +389,48 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 #endif
 
-    int stream_warmup = arg_parser.get_int("warmup");
-    int stream_repeat = arg_parser.get_int("repeat");
-    bool kname        = arg_parser.get_bool("kname");
-
-    ck_tile::stream_config stream_config{nullptr,
-                                         true,
-                                         /* log_level = */ (kname ? 1 : 0),
-                                         stream_warmup,
-                                         stream_repeat,
-                                         arg_parser.get_str("timer") == std::string("gpu")};
-
     const auto seqstart_q_host              = to_seqstarts(seqlen_qs);
     const auto seqstart_k_host              = to_seqstarts(seqlen_ks);
     const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads);
 
+    // Optional padded Q seqstarts (group-mode only)
+    std::vector<int32_t> seqstart_q_with_padding_host;
+    if(mode == mode_enum::group && !seqlen_qpads.empty() && seqlen_qpads[0] != -1)
+    {
+        if(seqlen_qpads.size() < static_cast<size_t>(batch))
+        {
+            seqlen_qpads.resize(batch, seqlen_qpads.back());
+        }
+        if(seqlen_qpads.size() == static_cast<size_t>(batch))
+        {
+            seqstart_q_with_padding_host = to_seqstarts(
+                ck_tile::span<const int32_t>(seqlen_qpads.data(), seqlen_qpads.size()));
+        }
+    }
+
+    // Optional batch-mode cumulative seqlen overrides
+    std::vector<ck_tile::index_t> cuq_cum, cukv_cum;
+    if(mode == mode_enum::batch)
+    {
+        auto calculate_cumulative = [&](std::vector<ck_tile::index_t>& per_batch_vec,
+                                        std::vector<ck_tile::index_t>& cum_vec) {
+            if(!per_batch_vec.empty() && per_batch_vec[0] != -1)
+            {
+                if(per_batch_vec.size() < static_cast<size_t>(batch))
+                {
+                    per_batch_vec.resize(batch, per_batch_vec.back());
+                }
+                cum_vec.resize(batch + 1);
+                cum_vec[0] = 0;
+                for(int i = 0; i < batch; ++i)
+                    cum_vec[i + 1] = cum_vec[i] + per_batch_vec[i];
+            }
+        };
+
+        calculate_cumulative(q_eff_lens_per_batch, cuq_cum);
+        calculate_cumulative(kv_eff_lens_per_batch, cukv_cum);
+    }
+
     using TypeConfig = FmhaFwdTypeConfig<DataTypeConfig>;
 
     using QDataType             = typename TypeConfig::QDataType;
@@ -479,28 +445,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using OaccDataType          = typename TypeConfig::OaccDataType;
     using ODataType             = typename TypeConfig::ODataType;
 
-    float range_q = arg_parser.get_float("range_q");
-    float range_k = arg_parser.get_float("range_k");
-    float range_v = arg_parser.get_float("range_v");
-    float range_p = arg_parser.get_float("range_p");
-    float range_o = arg_parser.get_float("range_o");
-
-    float q_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<QDataType>::max());
-    float k_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<KDataType>::max());
-    float v_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<VDataType>::max());
-    float p_dtype_max = v_dtype_max; // assume p and v is the same type
-    float o_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<ODataType>::max());
-
-    float scale_p = 1.f;
-    float scale_o = 1.f;
-
-    if(squant)
-    {
-        scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max);
-        scale_p = p_dtype_max / range_p;
-        scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max);
-    }
-
     // accumulation numbers for performance evaluation
     std::size_t flop = 0, num_byte = 0;
     auto max_seqlen_q =
@@ -546,12 +490,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(128 < num_splits)
     {
         std::cerr << "num_splits greater than 128 is not supported" << std::endl;
-        return false;
+        return fwd_result::invalid_args;
     }
 #if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
     if(0 < p_drop && (1 < num_splits || use_kvcache))
     {
-        std::cerr << "dropout is not supoprted by split-kv kernels. ignoring the 'p_drop' option"
+        std::cerr << "dropout is not supported by split-kv kernels. ignoring the 'p_drop' option"
                   << std::endl;
         p_drop = 0.0f;
     }
@@ -568,12 +512,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
             return std::array<ck_tile::index_t, 4>{b, s, h, d};
     };
 
-    bool is_v_rowmajor = vlayout == std::string("r");
-
     // host memory for storing all the tensor elements
     const ck_tile::index_t shape_batch = (mode == mode_enum::batch ? batch : 1);
-    const ck_tile::index_t shape_seqlen_q =
+    // logical(unpadded) total seqlen_q for group; batch uses fixed seqlen
+    const ck_tile::index_t shape_seqlen_q_lse =
         (mode == mode_enum::batch ? seqlen_qs[0] : seqstart_q_host.back());
+    // physical(padded) total seqlen_q for group when s_qpad is provided; else use logical
+    const ck_tile::index_t shape_seqlen_q =
+        (mode == mode_enum::batch
+             ? seqlen_qs[0]
+             : (seqstart_q_with_padding_host.empty() ? seqstart_q_host.back()
+                                                     : seqstart_q_with_padding_host.back()));
     const ck_tile::index_t shape_seqlen_k =
         (mode == mode_enum::batch ? seqlen_ks[0]
                                   : (seqlen_kpads[0] < 0 ? seqstart_k_host.back()
@@ -614,7 +563,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             : std::array<ck_tile::index_t, 2>{1, 1});
 
     auto [rotary_cos_host, rotary_sin_host] = generate_rotary_cos_sin<KDataType>(
-        std::max(shape_seqlen_q, shape_seqlen_k), rotary_dim, seed);
+        std::max(shape_seqlen_q, shape_seqlen_k), rotary_dim, next_seed());
 
     ck_tile::HostTensor<LSEDataType> lse_acc_host(
         1 < num_splits || use_kvcache
@@ -631,7 +580,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // batch mode of lse data layout is [batch, nhead, seqlen_q]
     // group mode of lse data layout is [nhead, total_seqlen_q]
     ck_tile::HostTensor<LSEDataType> lse_host(
-        lse ? std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q}
+        lse ? std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q_lse}
             : std::array<ck_tile::index_t, 3>{1, 1, 1} /* dummy shape for simplifying code */);
 
     ck_tile::HostTensor<ODataType> o_host(
@@ -648,42 +597,44 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<int32_t> cache_batch_idx_host(use_cache_batch_idx
                                                           ? std::array<ck_tile::index_t, 1>{batch}
                                                           : std::array<ck_tile::index_t, 1>{1});
-
+    float max_o = 5.0;
     if(init_method == "ui" || init_method == "0")
     {
-        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
-        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
-        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(knew_host);
-        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
-        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(vnew_host);
-        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
+        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-3.f, 3.f, next_seed()}(q_host);
+        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, next_seed()}(k_host);
+        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, next_seed()}(knew_host);
+        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, next_seed()}(v_host);
+        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, next_seed()}(vnew_host);
+        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-3.f, 3.f, next_seed()}(
+            bias_host);
     }
     else if(init_method == "ni")
     {
-        ck_tile::FillNormalDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
-        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
-        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(knew_host);
-        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
-        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(vnew_host);
-        ck_tile::FillNormalDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
+        ck_tile::FillNormalDistributionIntegerValue<QDataType>{-3.f, 3.f, next_seed()}(q_host);
+        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, next_seed()}(k_host);
+        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, next_seed()}(knew_host);
+        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, next_seed()}(v_host);
+        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, next_seed()}(vnew_host);
+        ck_tile::FillNormalDistributionIntegerValue<BiasDataType>{-3.f, 3.f, next_seed()}(
+            bias_host);
     }
     else if(init_method == "uf" || init_method == "1")
     {
-        ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, seed}(q_host);
-        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(k_host);
-        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(knew_host);
-        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(v_host);
-        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(vnew_host);
-        ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, seed}(bias_host);
+        ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, next_seed()}(q_host);
+        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, next_seed()}(k_host);
+        ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, next_seed()}(knew_host);
+        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, next_seed()}(v_host);
+        ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, next_seed()}(vnew_host);
+        ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, next_seed()}(bias_host);
     }
     else if(init_method == "nf")
     {
-        ck_tile::FillNormalDistribution<QDataType>{0.f, 3.f, seed}(q_host);
-        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, seed}(k_host);
-        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, seed}(knew_host);
-        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, seed}(v_host);
-        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, seed}(vnew_host);
-        ck_tile::FillNormalDistribution<BiasDataType>{0.f, 3.f, seed}(bias_host);
+        ck_tile::FillNormalDistribution<QDataType>{0.f, 3.f, next_seed()}(q_host);
+        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, next_seed()}(k_host);
+        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, next_seed()}(knew_host);
+        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, next_seed()}(v_host);
+        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, next_seed()}(vnew_host);
+        ck_tile::FillNormalDistribution<BiasDataType>{0.f, 3.f, next_seed()}(bias_host);
     }
     else if(init_method == "tf" || init_method == "2")
     {
@@ -694,20 +645,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
         ck_tile::FillTrigValue<VDataType>{}(vnew_host);
         ck_tile::FillTrigValue<BiasDataType>{}(bias_host);
     }
-    else if(init_method == "ufq" || init_method == "uf:q" ||
-            init_method == "3") // suitable for fp8 quantization
-    {
-        ck_tile::FillUniformDistribution<QDataType>{-q_dtype_max, q_dtype_max, seed}(q_host);
-        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(k_host);
-        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(knew_host);
-        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(v_host);
-        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(vnew_host);
-
-        // bias_fp8 = qscale_bias * bias_fp32
-        float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k);
-        // Assume bias is in [-1.f, 1.f] in original fp32
-        ck_tile::FillUniformDistribution<BiasDataType>{-qscale_bias, qscale_bias, seed}(bias_host);
-    }
     if(bias.type == bias_enum::alibi)
     {
         auto slopes = ck_tile::get_alibi_slopes<SaccDataType>(nhead);
@@ -726,13 +663,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
             }
         }
     }
-    iota_shuffle(block_table_host.begin(), block_table_host.end(), 0);
-    iota_shuffle(cache_batch_idx_host.begin(), cache_batch_idx_host.end(), 0);
+    iota_shuffle(block_table_host.begin(), block_table_host.end(), 0, random_engine);
+    iota_shuffle(cache_batch_idx_host.begin(), cache_batch_idx_host.end(), 0, random_engine);
 
     ck_tile::DeviceMem q_buf(q_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem k_buf(k_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem knew_buf(knew_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem v_buf(v_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem knew_buf(knew_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem vnew_buf(vnew_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem lse_acc_buf(lse_acc_host.get_element_space_size_in_bytes());
@@ -741,6 +678,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
     ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem seqstart_q_padded_buf(seqstart_q_with_padding_host.empty()
+                                                 ? 0
+                                                 : seqstart_q_with_padding_host.size() *
+                                                       sizeof(int32_t));
+    ck_tile::DeviceMem seqstart_k_padded_buf(
+        seqlen_kpads[0] < 0 ? 0 : seqstart_k_with_padding_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem cu_seqlen_q_buf(cuq_cum.empty() ? 0
+                                                       : cuq_cum.size() * sizeof(ck_tile::index_t));
+    ck_tile::DeviceMem cu_seqlen_kv_buf(
+        cukv_cum.empty() ? 0 : cukv_cum.size() * sizeof(ck_tile::index_t));
     ck_tile::DeviceMem seqlen_k_buf((mode == mode_enum::batch && use_kvcache) ||
                                             0 <= seqlen_kpads[0]
                                         ? seqlen_ks.size() * sizeof(int32_t)
@@ -756,15 +703,90 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::DeviceMem block_table_buf(block_table_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem cache_batch_idx_buf(cache_batch_idx_host.get_element_space_size_in_bytes());
 
+    float scale_p = 1.f;
+    float scale_o = 1.f;
+    if(squant)
+    {
+        float q_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<QDataType>::max());
+        float k_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<KDataType>::max());
+        float v_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<VDataType>::max());
+        float p_dtype_max = v_dtype_max; // assume p and v is the same type
+        // Q tensor
+        {
+            float max_value = ck_tile::type_convert<float>(ck_tile::numeric<QDataType>::min());
+            q_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                if(val > max_value)
+                    max_value = val;
+            });
+
+            float scale = q_dtype_max / max_value;
+
+            q_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                self(idx) = ck_tile::type_convert<QDataType>(val * scale);
+            });
+            scale_s = scale_s / scale;
+        }
+
+        // K tensor
+        {
+            float max_value = ck_tile::type_convert<float>(ck_tile::numeric<KDataType>::min());
+            k_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                if(val > max_value)
+                    max_value = val;
+            });
+            float scale = k_dtype_max / max_value;
+            k_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                self(idx) = ck_tile::type_convert<KDataType>(val * scale);
+            });
+            scale_s = scale_s / scale;
+        }
+
+        // V tensor
+        {
+            float max_value = ck_tile::type_convert<float>(ck_tile::numeric<VDataType>::min());
+            v_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                if(val > max_value)
+                    max_value = val;
+            });
+
+            float scale = k_dtype_max / max_value;
+            v_host.ForEach([&](auto& self, auto idx) {
+                float val = ck_tile::type_convert<float>(self(idx));
+                self(idx) = ck_tile::type_convert<VDataType>(val * scale);
+            });
+
+            scale_o = (1.0 / p_dtype_max) / scale;
+        }
+
+        scale_p = p_dtype_max;
+
+        if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp8>)
+        {
+            float o_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<ODataType>::max());
+            scale_o           = scale_o * o_dtype_max / max_o;
+        }
+    }
+
     q_buf.ToDevice(q_host.data());
     k_buf.ToDevice(k_host.data());
-    knew_buf.ToDevice(knew_host.data());
     v_buf.ToDevice(v_host.data());
+    knew_buf.ToDevice(knew_host.data());
     vnew_buf.ToDevice(vnew_host.data());
     bias_buf.ToDevice(bias_host.data());
     seqstart_q.ToDevice(seqstart_q_host.data());
-    seqstart_k.ToDevice(seqlen_kpads[0] < 0 ? seqstart_k_host.data()
-                                            : seqstart_k_with_padding_host.data());
+    // Keep logical starts in seqstart_k; pass padded K via separate pointer
+    seqstart_k.ToDevice(seqstart_k_host.data());
+    seqstart_q_padded_buf.ToDevice(
+        seqstart_q_with_padding_host.empty() ? nullptr : seqstart_q_with_padding_host.data());
+    seqstart_k_padded_buf.ToDevice(seqlen_kpads[0] < 0 ? nullptr
+                                                       : seqstart_k_with_padding_host.data());
+    cu_seqlen_q_buf.ToDevice(cuq_cum.empty() ? nullptr : cuq_cum.data());
+    cu_seqlen_kv_buf.ToDevice(cukv_cum.empty() ? nullptr : cukv_cum.data());
     seqlen_k_buf.ToDevice((mode == mode_enum::batch && use_kvcache) || 0 <= seqlen_kpads[0]
                               ? seqlen_ks.data()
                               : nullptr);
@@ -787,15 +809,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
         else return layout_str(iperm_) + std::string("-") + layout_str(operm_);
     };
     // clang-format on
-    const std::string prec = arg_parser.get_str("prec");
 
-    std::cout << "[" << prec << "|" << mode << "|" << io_layout(i_perm, o_perm) << "] b:" << batch
-              << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_qs[0] << "/" << seqlen_ks[0]
+    std::cout << "[" << data_type << "|" << mode << "|" << io_layout(i_perm, o_perm)
+              << "] b:" << batch << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_qs[0]
+              << "/" << seqlen_ks[0]
               << (seqlen_kpads[0] < 0 ? ""
                                       : (std::string("(") + std::to_string(seqlen_kpads[0]) + ")"))
               << ", d:" << hdim_q << "/" << hdim_v << ", scale_s:" << scale_s << ", bias:" << bias
               << ", p_drop:" << p_drop << ", lse:" << lse << ", squant:" << squant
-              << ", mask:" << mask << ", v:" << vlayout;
+              << ", mask:" << mask << ", v:" << (is_v_rowmajor ? "r" : "c");
 #if CK_TILE_FMHA_FWD_APPENDKV_API
     if(0 < rotary_dim)
     {
@@ -817,6 +839,54 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", cache_batch_idx:" << use_cache_batch_idx;
     }
 #endif
+    // Padding / effective length diagnostic logging
+    auto print_vec = [&](const char* label, const std::vector<int>& v) {
+        if(v.empty())
+            return;
+        std::cout << ", " << label << ":[";
+        for(std::size_t i = 0; i < v.size(); ++i)
+        {
+            if(i)
+                std::cout << ",";
+            std::cout << v[i];
+        }
+        std::cout << "]";
+    };
+
+    if(has_group_padding)
+    {
+        bool has_qpad = !seqstart_q_with_padding_host.empty();
+        bool has_kpad = (seqlen_kpads[0] >= 0);
+        if(has_qpad)
+        {
+            print_vec("q_logical", seqlen_qs);
+            print_vec("q_padded", seqlen_qpads);
+        }
+        if(has_kpad)
+        {
+            print_vec("k_logical", seqlen_ks);
+            print_vec("k_padded", seqlen_kpads);
+        }
+    }
+    else if(has_batch_efflens)
+    {
+        // derive effective lengths from cumulative arrays if present
+        if(!cuq_cum.empty())
+        {
+            std::vector<int> eff_q(batch);
+            for(int b_i = 0; b_i < batch; ++b_i)
+                eff_q[b_i] = static_cast<int>(cuq_cum[b_i + 1] - cuq_cum[b_i]);
+            print_vec("q_eff", eff_q);
+        }
+        if(!cukv_cum.empty())
+        {
+            std::vector<int> eff_kv(batch);
+            for(int b_i = 0; b_i < batch; ++b_i)
+                eff_kv[b_i] = static_cast<int>(cukv_cum[b_i + 1] - cukv_cum[b_i]);
+            print_vec("kv_eff", eff_kv);
+        }
+    }
+
     std::cout << std::flush;
 
     const auto init_traits = [&](auto& traits) {
@@ -847,13 +917,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
             else if constexpr(std::is_same_v<fmha_fwd_pagedkv_traits,
                                              std::decay_t<decltype(traits)>>)
             {
-                traits.use_pagedkv = use_kvcache;
+                traits.use_pagedkv = (0 < page_block_size);
             }
         }
     };
 
     const auto init_args = [&, k_paddings_ = seqlen_kpads](auto& args) {
-        assert(nhead % nhead_k == 0);
         /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
         ///       seqlen_k] in this example, hence both the 'batch_stride_bias' &
         ///       'nhead_stride_bias' are 0.
@@ -901,8 +970,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
         const ck_tile::index_t nhead_stride_bias =
             (i_perm ? 0 * shape_seqlen_q * max_seqlen_k : 0 * max_seqlen_k);
         const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q);
+        const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q_lse;
+        const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q_lse);
         const ck_tile::index_t nhead_stride_o_acc   = (num_splits * shape_seqlen_q * hdim_v);
         const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
         // setup batch_stride_* arguments
@@ -917,8 +986,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
         const ck_tile::index_t batch_stride_vnew    = (nhead_k * hdim_v * seqlen_knew);
         const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q);
+        const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q_lse);
+        const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q_lse);
         const ck_tile::index_t batch_stride_o_acc = (nhead * num_splits * shape_seqlen_q * hdim_v);
         const ck_tile::index_t batch_stride_o     = (nhead * shape_seqlen_q * hdim_v);
         const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch);
@@ -1032,6 +1101,29 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 {
                     args.drop_seed_offset = std::make_pair(drop_seed, drop_offset);
                 }
+
+                // Group-mode: optional physical padded starts for Q/K
+                if(mode == mode_enum::group)
+                {
+                    args.seqstart_padded_q_ptr = (seqstart_q_with_padding_host.empty()
+                                                      ? nullptr
+                                                      : seqstart_q_padded_buf.GetDeviceBuffer());
+                    args.seqstart_padded_k_ptr =
+                        (seqlen_kpads[0] < 0 ? nullptr : seqstart_k_padded_buf.GetDeviceBuffer());
+                }
+
+                // Batch-mode: optional cumulative effective seqlen overrides
+                if(mode == mode_enum::batch)
+                {
+                    args.cu_seqlen_q_ptr  = cuq_cum.empty()
+                                                ? nullptr
+                                                : reinterpret_cast<const ck_tile::index_t*>(
+                                                     cu_seqlen_q_buf.GetDeviceBuffer());
+                    args.cu_seqlen_kv_ptr = cukv_cum.empty()
+                                                ? nullptr
+                                                : reinterpret_cast<const ck_tile::index_t*>(
+                                                      cu_seqlen_kv_buf.GetDeviceBuffer());
+                }
             }
             else if constexpr(std::is_same_v<fmha_fwd_splitkv_args, std::decay_t<decltype(args)>>)
             {
@@ -1071,7 +1163,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }
     };
 
-    const float appendkv_ave_time = [&] {
+    auto run_appendkv = [&](const ck_tile::stream_config& sc) {
 #if CK_TILE_FMHA_FWD_APPENDKV_API
         if(need_append_kvcache)
         {
@@ -1081,27 +1173,21 @@ bool run(const ck_tile::ArgParser& arg_parser)
             fmha_fwd_appendkv_args fwd_appendkv_args;
             init_args(fwd_appendkv_args);
 
-            return fmha_fwd_appendkv(fwd_appendkv_traits, fwd_appendkv_args, stream_config);
+            return fmha_fwd_appendkv(fwd_appendkv_traits, fwd_appendkv_args, sc);
         }
 #endif
         return 0.0f;
-    }();
+    };
+    const float appendkv_ave_time = run_appendkv(stream_config);
+    if(appendkv_ave_time < 0.0f)
+    {
+        std::cout << ", not supported yet" << std::flush << std::endl;
+        return fwd_result::no_instance;
+    }
 
-    const float fwd_ave_time = [&] {
-#if CK_TILE_FMHA_FWD_SPLITKV_API
-        if(1 < num_splits && use_kvcache)
-        {
-            fmha_fwd_splitkv_traits fmha_splitkv_traits;
-            init_traits(fmha_splitkv_traits);
-
-            fmha_fwd_splitkv_args fmha_splitkv_args;
-            init_args(fmha_splitkv_args);
-
-            return fmha_fwd_splitkv(fmha_splitkv_traits, fmha_splitkv_args, stream_config);
-        }
-#endif
+    auto run_fwd = [&](const ck_tile::stream_config& sc) {
 #if CK_TILE_FMHA_FWD_PAGEDKV_API
-        if(use_kvcache)
+        if(1 == num_splits && use_kvcache)
         {
             fmha_fwd_pagedkv_traits fmha_pagedkv_traits;
             init_traits(fmha_pagedkv_traits);
@@ -1109,40 +1195,59 @@ bool run(const ck_tile::ArgParser& arg_parser)
             fmha_fwd_pagedkv_args fmha_pagedkv_args;
             init_args(fmha_pagedkv_args);
 
-            return fmha_fwd_pagedkv(fmha_pagedkv_traits, fmha_pagedkv_args, stream_config);
-        }
+            const float ave_time = fmha_fwd_pagedkv(fmha_pagedkv_traits, fmha_pagedkv_args, sc);
+#if CK_TILE_FMHA_FWD_SPLITKV_API
+            // If there is no instance for these args, fallback to fmha_fwd_splitkv
+            if(ave_time >= 0.0f)
+                return ave_time;
+#else
+            return ave_time;
 #endif
+        }
+#endif // CK_TILE_FMHA_FWD_PAGEDKV_API
+#if CK_TILE_FMHA_FWD_SPLITKV_API
+        if(1 < num_splits || use_kvcache)
+        {
+            fmha_fwd_splitkv_traits fmha_splitkv_traits;
+            init_traits(fmha_splitkv_traits);
+
+            fmha_fwd_splitkv_args fmha_splitkv_args;
+            init_args(fmha_splitkv_args);
+
+            return fmha_fwd_splitkv(fmha_splitkv_traits, fmha_splitkv_args, sc);
+        }
+#endif // CK_TILE_FMHA_FWD_SPLITKV_API
         fmha_fwd_traits fmha_traits;
         init_traits(fmha_traits);
 
         fmha_fwd_args fmha_args;
         init_args(fmha_args);
 
-        return fmha_fwd(fmha_traits, fmha_args, stream_config);
-    }();
-
-    if(appendkv_ave_time < 0.0f || fwd_ave_time < 0.0f)
+        return fmha_fwd(fmha_traits, fmha_args, sc);
+    };
+    const float fwd_ave_time = run_fwd(stream_config);
+    if(fwd_ave_time < 0.0f)
     {
         std::cout << ", not supported yet" << std::flush << std::endl;
-        return false;
+        return fwd_result::no_instance;
     }
 
-    const float ave_time = (appendkv_ave_time + fwd_ave_time);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
-              << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
-              << " GB/s" << std::flush << std::endl;
+    const float ave_time   = appendkv_ave_time + fwd_ave_time;
+    const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    const float gb_per_sec = num_byte / 1.E6 / ave_time;
+    if(stream_config.time_kernel_)
+    {
+        std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
+                  << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2)
+                  << gb_per_sec << " GB/s" << std::flush;
+    }
 
+    bool pass = true;
     if(do_validation == 0)
     {
         std::cout << std::flush << std::endl;
-        return true;
     }
-    if(do_validation == 2)
+    else if(do_validation == 2)
     {
         // NOTE: use gpu to do validation
         ck_tile::naive_attention_fwd_traits naive_t;
@@ -1188,389 +1293,429 @@ bool run(const ck_tile::ArgParser& arg_parser)
         o_buf.FromDevice(o_host.data()); // TODO: ugly
 
         auto [rtol_, atol_] = get_elimit<DataTypeConfig>(init_method);
-        bool pass_          = ck_tile::check_err(
+        pass                = ck_tile::check_err(
             o_host, o_naive_ref, std::string("OUT Error: Incorrect results!"), rtol_, atol_);
-        std::cout << ", valid:" << (pass_ ? "y" : "n") << std::flush << std::endl;
-        return pass_;
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
-
-    o_buf.FromDevice(o_host.data());
-    lse_buf.FromDevice(lse_host.data());
-    randval_buf.FromDevice(randval_host.data());
-
-    auto p_compute_element_func = [&]() {
-        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
-            return ck_tile::scales{scale_p};
-        else
-            return ck_tile::identity{};
-    }();
-
-    auto oacc_element_func = [&]() {
-        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
-            return ck_tile::composes(ck_tile::saturates<ck_tile::fp8_t>{},
-                                     ck_tile::scales{scale_o});
-        else
-            return ck_tile::identity{};
-    }();
-
-    float p_undrop = 1.0 - p_drop;
-    uint8_t p_undrop_in_uint8_t =
-        uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
-    float rp_undrop = 1.0 / p_undrop;
-
-    bool pass = true;
-    for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+    else
     {
-        const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
-        const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
-
-        // adjust matrix index according to the mode
-        const ck_tile::index_t b_idx = (mode == mode_enum::batch ? wb : 0);
-        const ck_tile::index_t cache_b_idx =
-            (use_cache_batch_idx ? cache_batch_idx_host(b_idx) : b_idx);
-        const ck_tile::index_t query_offset = (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
-        const ck_tile::index_t key_offset =
-            (mode == mode_enum::batch
-                 ? 0
-                 : (seqlen_kpads[0] < 0 ? seqstart_k_host[wb] : seqstart_k_with_padding_host[wb]));
-
-        ck_tile::HostTensor<QDataType> q_host_ref({nhead, real_seqlen_q, hdim_q});
-        ck_tile::HostTensor<KDataType> k_host_ref({nhead, real_seqlen_k, hdim_q});
-        ck_tile::HostTensor<VDataType> v_host_ref({nhead, hdim_v, real_seqlen_k});
-        ck_tile::HostTensor<ODataType> o_host_ref({nhead, real_seqlen_q, hdim_v});
-
-        ck_tile::HostTensor<SMPLComputeDataType> s_host_ref({nhead, real_seqlen_q, real_seqlen_k});
-        ck_tile::HostTensor<PDataType> p_host_ref({nhead, real_seqlen_q, real_seqlen_k});
-        ck_tile::HostTensor<SMPLComputeDataType> lse_host_ref({nhead, real_seqlen_q});
-
-        ck_tile::index_t nr = nhead / nhead_k;
-
-        // clang-format off
-        // permute
-        if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b_idx, i[0], i[1] + query_offset, i[2]); });
-        else       q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b_idx, i[1] + query_offset, i[0], i[2]); });
-
 #if CK_TILE_FMHA_FWD_APPENDKV_API
-        // optionally apply RoPE to the q_host_ref
-        if(0 < rotary_dim)
+        // When rotary embedding is used, the appendkv kernel modifies the q tensor (multiple times
+        // when time_kernel_ is set). We need to reset the q buffer and rerun all kernels.
+        if(0 < rotary_dim && stream_config.time_kernel_)
         {
-            decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths());
-
-            auto [rotary_cos_slice, rotary_sin_slice] =
-                slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q);
-
-            ck_tile::reference_batched_rotary_position_embedding(
-                q_host_ref, rotary_cos_slice, rotary_sin_slice, is_rotary_interleaved, q_host_ref_ro,
-                /*use_1_row_sin_cos=*/mask.type == mask_enum::no_mask);
-
-            q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host_ref_ro(i); });
+            const ck_tile::stream_config stream_config2{stream_config.stream_id_, false, 0};
+            q_buf.ToDevice(q_host.data());
+            run_appendkv(stream_config2);
+            run_fwd(stream_config2);
         }
 #endif
-#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
-        if(0 < page_block_size) {
-            if(i_perm) {
-                k_host_ref.ForEach([&](auto& self, auto i) {
-                    self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]);
-                });
-            } else {
-                k_host_ref.ForEach([&](auto& self, auto i) {
-                    self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]);
-                });
+        o_buf.FromDevice(o_host.data());
+        lse_buf.FromDevice(lse_host.data());
+        randval_buf.FromDevice(randval_host.data());
+
+        constexpr bool supports_squant = std::is_same_v<DataTypeConfig, FmhaFwdFp8> ||
+                                         std::is_same_v<DataTypeConfig, FmhaFwdFp8Bf16> ||
+                                         std::is_same_v<DataTypeConfig, FmhaFwdFp8Fp32>;
+
+        auto p_compute_element_func = [&]() {
+            if constexpr(supports_squant)
+                return ck_tile::scales{scale_p};
+            else
+                return ck_tile::identity{};
+        }();
+
+        auto oacc_element_func = [&]() {
+            if constexpr(std::is_same_v<ODataType, ck_tile::fp8_t> && supports_squant)
+                return ck_tile::composes(ck_tile::saturates<ck_tile::fp8_t>{},
+                                         ck_tile::scales{scale_o});
+            else if constexpr(supports_squant)
+                return ck_tile::scales{scale_o};
+            else
+                return ck_tile::identity{};
+        }();
+
+        float p_undrop = 1.0 - p_drop;
+        uint8_t p_undrop_in_uint8_t =
+            uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
+        float rp_undrop = 1.0 / p_undrop;
+
+        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
+        {
+            ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
+            ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
+            if(mode == mode_enum::batch)
+            {
+                if(!cuq_cum.empty())
+                {
+                    real_seqlen_q = cuq_cum[wb + 1] - cuq_cum[wb];
+                }
+                if(!cukv_cum.empty())
+                {
+                    real_seqlen_k = cukv_cum[wb + 1] - cukv_cum[wb];
+                }
             }
-        } else
-#endif
-        {
-            if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); });
-            else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); });
-        }
+
+            // adjust matrix index according to the mode
+            const ck_tile::index_t b_idx = (mode == mode_enum::batch ? wb : 0);
+            const ck_tile::index_t cache_b_idx =
+                (use_cache_batch_idx ? cache_batch_idx_host(b_idx) : b_idx);
+            const ck_tile::index_t query_offset =
+                (mode == mode_enum::batch
+                     ? 0
+                     : (seqstart_q_with_padding_host.empty() ? seqstart_q_host[wb]
+                                                             : seqstart_q_with_padding_host[wb]));
+            const ck_tile::index_t key_offset =
+                (mode == mode_enum::batch
+                     ? 0
+                     : (seqlen_kpads[0] < 0 ? seqstart_k_host[wb]
+                                            : seqstart_k_with_padding_host[wb]));
+
+            ck_tile::HostTensor<QDataType> q_host_ref({nhead, real_seqlen_q, hdim_q});
+            ck_tile::HostTensor<KDataType> k_host_ref({nhead, real_seqlen_k, hdim_q});
+            ck_tile::HostTensor<VDataType> v_host_ref({nhead, hdim_v, real_seqlen_k});
+            ck_tile::HostTensor<ODataType> o_host_ref({nhead, real_seqlen_q, hdim_v});
+
+            ck_tile::HostTensor<SMPLComputeDataType> s_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k});
+            ck_tile::HostTensor<PDataType> p_host_ref({nhead, real_seqlen_q, real_seqlen_k});
+            ck_tile::HostTensor<SMPLComputeDataType> lse_host_ref({nhead, real_seqlen_q});
+
+            ck_tile::index_t nr = nhead / nhead_k;
+
+            // clang-format off
+            // permute
+            if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b_idx, i[0], i[1] + query_offset, i[2]); });
+            else       q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b_idx, i[1] + query_offset, i[0], i[2]); });
+                // clang-format on
 
 #if CK_TILE_FMHA_FWD_APPENDKV_API
-        // copy Knew to the end of K
-        if(0 < seqlen_knew)
-        {
-            ck_tile::HostTensor<KDataType> knew_host_ref({nhead, seqlen_knew, hdim_q});
-            if(i_perm) knew_host_ref.ForEach([&](auto& self, auto i) { self(i) = knew_host(wb, i[0] / nr, i[1], i[2]); });
-            else       knew_host_ref.ForEach([&](auto& self, auto i) { self(i) = knew_host(wb, i[1], i[0] / nr, i[2]); });
-
-            // optionally apply RoPE to the knew_host_ref
-            auto* real_knew_host_ref = &knew_host_ref;
-            std::optional<decltype(knew_host_ref)> knew_host_ref_ro;
+            // optionally apply RoPE to the q_host_ref
             if(0 < rotary_dim)
             {
-                knew_host_ref_ro.emplace(knew_host_ref.get_lengths());
+                decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths());
 
-                auto [rotary_cos_slice, rotary_sin_slice] =
-                    slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew);
+                auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(
+                    rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q);
 
                 ck_tile::reference_batched_rotary_position_embedding(
-                    knew_host_ref,
+                    q_host_ref,
                     rotary_cos_slice,
                     rotary_sin_slice,
                     is_rotary_interleaved,
-                    knew_host_ref_ro.value());
+                    q_host_ref_ro,
+                    /*use_1_row_sin_cos=*/mask.type == mask_enum::no_mask);
 
-                real_knew_host_ref = &knew_host_ref_ro.value();
+                q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host_ref_ro(i); });
             }
-
-            (*real_knew_host_ref).ForEach([&](auto& self, auto i) {
-                k_host_ref(i[0], i[1] + cache_seqlen_ks[wb], i[2]) = self(i);
-            });
-        }
 #endif
 #if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
-        if(0 < page_block_size) {
-            if(is_v_rowmajor) {
-                if(i_perm) {
-                    v_host_ref.ForEach([&](auto& self, auto i) {
-                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]);
-                    });
-                } else {
-                    v_host_ref.ForEach([&](auto& self, auto i) {
-                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]);
-                    });
-                }
+            if(0 < page_block_size)
+            {
+                // clang-format off
+                if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]); });
+                else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]); });
+                // clang-format on
             }
             else
-            {
-                if(i_perm) {
-                    v_host_ref.ForEach([&](auto& self, auto i) {
-                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size);
-                    });
-                } else {
-                    v_host_ref.ForEach([&](auto& self, auto i) {
-                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[1], i[0] / nr, i[2] % page_block_size);
-                    });
-                }
-            }
-        } else
 #endif
-        {
-            if(is_v_rowmajor) {
-                //                                                             v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
-                if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[0] / nr, i[2] + key_offset, i[1]); });
-                //                                                             v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
-                else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[2] + key_offset, i[0] / nr, i[1]); });
-            }
-            else
             {
-                if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[0] / nr, i[1], i[2] + key_offset); });
-                else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[1], i[0] / nr, i[2] + key_offset); });
+                // clang-format off
+                if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); });
+                else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); });
+                // clang-format on
             }
-        }
 
 #if CK_TILE_FMHA_FWD_APPENDKV_API
-        // copy Vnew to the end of V
-        if(0 < seqlen_knew)
-        {
-            ck_tile::HostTensor<VDataType> vnew_host_ref({nhead, hdim_v, seqlen_knew});
-            if(is_v_rowmajor)
+            // copy Knew to the end of K
+            if(0 < seqlen_knew)
             {
-                if(i_perm) vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[0] / nr, i[2], i[1]); });
-                else       vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[2], i[0] / nr, i[1]); });
-            }
-            else
-            {
-                if(i_perm) vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[0] / nr, i[1], i[2]); });
-                else       vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[1], i[0] / nr, i[2]); });
-            }
+                ck_tile::HostTensor<KDataType> knew_host_ref({nhead, seqlen_knew, hdim_q});
+                // clang-format off
+                if(i_perm) knew_host_ref.ForEach([&](auto& self, auto i) { self(i) = knew_host(wb, i[0] / nr, i[1], i[2]); });
+                else       knew_host_ref.ForEach([&](auto& self, auto i) { self(i) = knew_host(wb, i[1], i[0] / nr, i[2]); });
+                // clang-format on
 
-            vnew_host_ref.ForEach([&](auto& self, auto i) {
-                v_host_ref(i[0], i[1], i[2] + cache_seqlen_ks[wb]) = self(i);
-            });
-        }
-#endif
-        // clang-format on
-
-        // reference
-        ck_tile::reference_batched_gemm<QDataType, KDataType, SaccDataType, SMPLComputeDataType>(
-            q_host_ref,
-            k_host_ref,
-            s_host_ref,
-            ck_tile::identity{},
-            ck_tile::identity{},
-            ck_tile::scales(scale_s));
-
-        if(0.f < logits_soft_cap)
-        {
-            ck_tile::reference_unary_elementwise<SaccDataType, SaccDataType, SaccDataType>(
-                s_host_ref, s_host_ref, [logits_soft_cap](SaccDataType logits) {
-                    return ck_tile::type_convert<SaccDataType>(
-                        logits_soft_cap *
-                        std::tanhf(ck_tile::type_convert<float>(logits / logits_soft_cap)));
-                });
-        }
-
-        if(bias.type == bias_enum::elementwise_bias)
-        {
-            // elementwise bias
-            ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
-            // clang-format off
-            if(i_perm)
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
-            else
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
-            // clang-format on
-
-            // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
-            // real_seqlen_k]
-            ck_tile::reference_batched_elementwise<SMPLComputeDataType,
-                                                   BiasDataType,
-                                                   SMPLComputeDataType,
-                                                   SMPLComputeDataType>(
-                s_host_ref, bias_host_ref, s_host_ref);
-        }
-        else if(bias.type == bias_enum::alibi)
-        {
-            // alibi construct elementwise bias to verify
-            auto alibi_host = [&]() {
-                if(mask.type != mask_enum::no_mask)
+                // optionally apply RoPE to the knew_host_ref
+                auto* real_knew_host_ref = &knew_host_ref;
+                std::optional<decltype(knew_host_ref)> knew_host_ref_ro;
+                if(0 < rotary_dim)
                 {
-                    return ck_tile::make_alibi_from_lr_mask<SaccDataType, true>(
-                        0,
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        static_cast<ck_tile::GenericAttentionMaskEnum>(mask.type));
+                    knew_host_ref_ro.emplace(knew_host_ref.get_lengths());
+
+                    auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(
+                        rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew);
+
+                    ck_tile::reference_batched_rotary_position_embedding(knew_host_ref,
+                                                                         rotary_cos_slice,
+                                                                         rotary_sin_slice,
+                                                                         is_rotary_interleaved,
+                                                                         knew_host_ref_ro.value());
+
+                    real_knew_host_ref = &knew_host_ref_ro.value();
+                }
+
+                (*real_knew_host_ref).ForEach([&](auto& self, auto i) {
+                    k_host_ref(i[0], i[1] + cache_seqlen_ks[wb], i[2]) = self(i);
+                });
+            }
+#endif
+#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
+            if(0 < page_block_size)
+            {
+                if(is_v_rowmajor)
+                {
+                    // clang-format off
+                    if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); });
+                    else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]); });
+                    // clang-format on
                 }
                 else
                 {
-                    return ck_tile::Alibi<SaccDataType, true>{
-                        0, real_seqlen_q, real_seqlen_k, ck_tile::AlibiMode::FROM_BOTTOM_RIGHT};
-                }
-            }();
-
-            ck_tile::HostTensor<SaccDataType> alibi_bias_host_ref(
-                {nhead, real_seqlen_q, real_seqlen_k});
-            auto i_b_slope = bias.rank_info == 0 ? 0 : wb;
-            for(auto i_h = 0; i_h < nhead; i_h++)
-            {
-                SaccDataType current_slope = alibi_slope_host(i_b_slope, i_h);
-                alibi_host.slope = alibi_host.mode == ck_tile::AlibiMode::VERTICAL ? current_slope
-                                                                                   : -current_slope;
-                for(auto i_r = 0; i_r < real_seqlen_q; i_r++)
-                {
-                    for(auto i_c = 0; i_c < real_seqlen_k; i_c++)
-                    {
-                        SaccDataType pixel = 0;
-                        alibi_host.update(pixel, i_r, i_c);
-                        alibi_bias_host_ref(i_h, i_r, i_c) = pixel;
-                    }
+                    // clang-format off
+                    if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size); });
+                    else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[1], i[0] / nr, i[2] % page_block_size); });
+                    // clang-format on
                 }
             }
-            // [nhead, real_seqlen_q, real_seqlen_k]
-            ck_tile::reference_batched_elementwise<SMPLComputeDataType,
-                                                   SaccDataType,
-                                                   SMPLComputeDataType,
-                                                   SMPLComputeDataType>(
-                s_host_ref, alibi_bias_host_ref, s_host_ref);
-        }
-
-        if(mask.type == mask_enum::no_mask)
-        {
-            ck_tile::reference_batched_masking<SaccDataType>(
-                s_host_ref, FmhaMasks::NoMask{real_seqlen_q, real_seqlen_k});
-        }
-        else if(mask.type == mask_enum::window_generic)
-        {
-            ck_tile::reference_batched_masking<SaccDataType>(
-                s_host_ref,
-                ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
-                    mask.left, mask.right, real_seqlen_q, real_seqlen_k));
-        }
-        else
-        {
-            // if left window size is negative, means causal
-            // else means generic (for current batch)
-            if(mask.left < 0)
-                ck_tile::reference_batched_masking<SaccDataType>(
-                    s_host_ref,
-                    ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::CausalMask>(
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        mask.type == mask_enum::mask_top_left));
             else
+#endif
+            {
+                if(is_v_rowmajor)
+                {
+                    // clang-format off
+                    //                                v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
+                    if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[0] / nr, i[2] + key_offset, i[1]); });
+                    //                                v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
+                    else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[2] + key_offset, i[0] / nr, i[1]); });
+                    // clang-format on
+                }
+                else
+                {
+                    // clang-format off
+                    if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[0] / nr, i[1], i[2] + key_offset); });
+                    else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(cache_b_idx, i[1], i[0] / nr, i[2] + key_offset); });
+                    // clang-format on
+                }
+            }
+
+#if CK_TILE_FMHA_FWD_APPENDKV_API
+            // copy Vnew to the end of V
+            if(0 < seqlen_knew)
+            {
+                ck_tile::HostTensor<VDataType> vnew_host_ref({nhead, hdim_v, seqlen_knew});
+                if(is_v_rowmajor)
+                {
+                    // clang-format off
+                    if(i_perm) vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[0] / nr, i[2], i[1]); });
+                    else       vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[2], i[0] / nr, i[1]); });
+                    // clang-format on
+                }
+                else
+                {
+                    // clang-format off
+                    if(i_perm) vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[0] / nr, i[1], i[2]); });
+                    else       vnew_host_ref.ForEach([&](auto& self, auto i) { self(i) = vnew_host(wb, i[1], i[0] / nr, i[2]); });
+                    // clang-format on
+                }
+
+                vnew_host_ref.ForEach([&](auto& self, auto i) {
+                    v_host_ref(i[0], i[1], i[2] + cache_seqlen_ks[wb]) = self(i);
+                });
+            }
+#endif
+
+            // reference
+            ck_tile::
+                reference_batched_gemm<QDataType, KDataType, SaccDataType, SMPLComputeDataType>(
+                    q_host_ref,
+                    k_host_ref,
+                    s_host_ref,
+                    ck_tile::identity{},
+                    ck_tile::identity{},
+                    ck_tile::scales(scale_s));
+
+            if(0.f < logits_soft_cap)
+            {
+                ck_tile::reference_unary_elementwise<SaccDataType, SaccDataType, SaccDataType>(
+                    s_host_ref, s_host_ref, [logits_soft_cap](SaccDataType logits) {
+                        return ck_tile::type_convert<SaccDataType>(
+                            logits_soft_cap *
+                            std::tanhf(ck_tile::type_convert<float>(logits / logits_soft_cap)));
+                    });
+            }
+
+            if(bias.type == bias_enum::elementwise_bias)
+            {
+                // elementwise bias
+                ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
+                // clang-format off
+                if(i_perm) bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
+                else       bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
+                // clang-format on
+
+                // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
+                // real_seqlen_k]
+                ck_tile::reference_batched_elementwise<SMPLComputeDataType,
+                                                       BiasDataType,
+                                                       SMPLComputeDataType,
+                                                       SMPLComputeDataType>(
+                    s_host_ref, bias_host_ref, s_host_ref);
+            }
+            else if(bias.type == bias_enum::alibi)
+            {
+                // alibi construct elementwise bias to verify
+                auto alibi_host = [&]() {
+                    if(mask.type != mask_enum::no_mask)
+                    {
+                        return ck_tile::make_alibi_from_lr_mask<SaccDataType, true>(
+                            0,
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            static_cast<ck_tile::GenericAttentionMaskEnum>(mask.type));
+                    }
+                    else
+                    {
+                        return ck_tile::Alibi<SaccDataType, true>{
+                            0, real_seqlen_q, real_seqlen_k, ck_tile::AlibiMode::FROM_BOTTOM_RIGHT};
+                    }
+                }();
+
+                ck_tile::HostTensor<SaccDataType> alibi_bias_host_ref(
+                    {nhead, real_seqlen_q, real_seqlen_k});
+                auto i_b_slope = bias.rank_info == 0 ? 0 : wb;
+                for(auto i_h = 0; i_h < nhead; i_h++)
+                {
+                    SaccDataType current_slope = alibi_slope_host(i_b_slope, i_h);
+                    alibi_host.slope           = alibi_host.mode == ck_tile::AlibiMode::VERTICAL
+                                                     ? current_slope
+                                                     : -current_slope;
+                    for(auto i_r = 0; i_r < real_seqlen_q; i_r++)
+                    {
+                        for(auto i_c = 0; i_c < real_seqlen_k; i_c++)
+                        {
+                            SaccDataType pixel = 0;
+                            alibi_host.update(pixel, i_r, i_c);
+                            alibi_bias_host_ref(i_h, i_r, i_c) = pixel;
+                        }
+                    }
+                }
+                // [nhead, real_seqlen_q, real_seqlen_k]
+                ck_tile::reference_batched_elementwise<SMPLComputeDataType,
+                                                       SaccDataType,
+                                                       SMPLComputeDataType,
+                                                       SMPLComputeDataType>(
+                    s_host_ref, alibi_bias_host_ref, s_host_ref);
+            }
+
+            if(mask.type == mask_enum::no_mask)
+            {
+                ck_tile::reference_batched_masking<SaccDataType>(
+                    s_host_ref, FmhaMasks::NoMask{real_seqlen_q, real_seqlen_k});
+            }
+            else if(mask.type == mask_enum::window_generic)
+            {
                 ck_tile::reference_batched_masking<SaccDataType>(
                     s_host_ref,
                     ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
-                        mask.left,
-                        mask.right,
-                        real_seqlen_q,
-                        real_seqlen_k,
-                        mask.type == mask_enum::mask_top_left));
-        }
-        if(lse)
-        {
-            ck_tile::reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
-                s_host_ref, p_host_ref, p_compute_element_func, lse_host_ref);
-        }
-        else
-        {
-            ck_tile::reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
-                s_host_ref, p_host_ref, p_compute_element_func);
-        }
+                        mask.left, mask.right, real_seqlen_q, real_seqlen_k));
+            }
+            else
+            {
+                // if left window size is negative, means causal
+                // else means generic (for current batch)
+                if(mask.left < 0)
+                    ck_tile::reference_batched_masking<SaccDataType>(
+                        s_host_ref,
+                        ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::CausalMask>(
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            mask.type == mask_enum::mask_top_left));
+                else
+                    ck_tile::reference_batched_masking<SaccDataType>(
+                        s_host_ref,
+                        ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
+                            mask.left,
+                            mask.right,
+                            real_seqlen_q,
+                            real_seqlen_k,
+                            mask.type == mask_enum::mask_top_left));
+            }
+            const ck_tile::HostTensor<SaccDataType> masked_s_host_ref = s_host_ref;
+            if(lse)
+            {
+                ck_tile::
+                    reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
+                        s_host_ref, p_host_ref, p_compute_element_func, lse_host_ref);
+            }
+            else
+            {
+                ck_tile::
+                    reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
+                        s_host_ref, p_host_ref, p_compute_element_func);
+            }
 
-        if(p_drop > 0)
-        {
-            ck_tile::HostTensor<RandValOutputDataType> randval_host_ref(
-                {nhead, real_seqlen_q, real_seqlen_k});
-            randval_host_ref.ForEach([&](auto& self, auto idx) {
-                self(idx) = randval_host(b_idx, idx[0], idx[1] + query_offset, idx[2]);
-            });
-            ck_tile::reference_batched_dropout(
-                p_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
-        }
+            if(p_drop > 0)
+            {
+                ck_tile::HostTensor<RandValOutputDataType> randval_host_ref(
+                    {nhead, real_seqlen_q, real_seqlen_k});
+                ck_tile::reference_batched_dropout_randval(
+                    randval_host_ref, wb, drop_seed, drop_offset);
+                ck_tile::reference_batched_dropout(
+                    p_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
 
-        ck_tile::reference_batched_gemm<PDataType, VDataType, OaccDataType, ODataType>(
-            p_host_ref,
-            v_host_ref,
-            o_host_ref,
-            ck_tile::identity{},
-            ck_tile::identity{},
-            oacc_element_func);
+                ck_tile::HostTensor<RandValOutputDataType> randval_host_result(
+                    {nhead, real_seqlen_q, real_seqlen_k});
+                randval_host_result.ForEach([&](auto& self, const auto& idx) {
+                    self(idx) = randval_host(b_idx, idx[0], idx[1] + query_offset, idx[2]);
+                });
+                masked_s_host_ref.ForEach([&](const auto& self, const auto& idx) {
+                    // Ignore all masked values in validation check
+                    if(std::isinf(self(idx)))
+                    {
+                        randval_host_ref(idx)    = 0;
+                        randval_host_result(idx) = 0;
+                    }
+                });
+                bool cur_pass = ck_tile::check_err(randval_host_result,
+                                                   randval_host_ref,
+                                                   "DROPOUT RANDVAL Error: Incorrect results!");
+                pass &= cur_pass;
+                if(!cur_pass)
+                {
+                    break;
+                }
+            }
 
-        ck_tile::HostTensor<ODataType> o_host_result({nhead, real_seqlen_q, hdim_v});
-        // clang-format off
-        // permute
-        if(o_perm) o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[0], idx[1] + query_offset, idx[2]); });
-        else       o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); });
-        // clang-format on
+            ck_tile::reference_batched_gemm<PDataType, VDataType, OaccDataType, ODataType>(
+                p_host_ref,
+                v_host_ref,
+                o_host_ref,
+                ck_tile::identity{},
+                ck_tile::identity{},
+                oacc_element_func);
 
-        auto [rtol, atol] = get_elimit<DataTypeConfig>(init_method);
-        bool cur_pass     = ck_tile::check_err(
-            o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
-        pass &= cur_pass;
-        if(!cur_pass)
-        {
-            std::cerr << "OUT mismatch found at batch: " << wb << std::endl
-                      << "\tseqlen_q: " << real_seqlen_q << std::endl
-                      << "\tseqlen_k: " << real_seqlen_k << std::endl
-                      << "\tseqstart_q: " << seqstart_q_host << std::endl
-                      << "\tseqstart_k: " << seqstart_k_host << std::endl;
-
-            break;
-        }
-
-        if(lse)
-        {
-            ck_tile::HostTensor<SMPLComputeDataType> lse_host_result({nhead, real_seqlen_q});
-            lse_host_result.ForEach([&](auto& self, auto idx) {
-                self(idx) = lse_host(b_idx, idx[0], idx[1] + query_offset);
-            });
-
-            cur_pass = ck_tile::check_err(lse_host_result,
-                                          lse_host_ref,
-                                          "LSE Error: Incorrect results!",
-                                          rtol,
-                                          atol,
-                                          /* allow_infinity_ref = */ true);
+            ck_tile::HostTensor<ODataType> o_host_result({nhead, real_seqlen_q, hdim_v});
+            // clang-format off
+            // permute
+            if(o_perm) o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[0], idx[1] + query_offset, idx[2]); });
+            else       o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); });
+            // clang-format on
 
+            auto [rtol, atol] = get_elimit<DataTypeConfig>(init_method);
+            bool cur_pass     = ck_tile::check_err(o_host_result,
+                                               o_host_ref,
+                                               std::string("OUT Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
             pass &= cur_pass;
             if(!cur_pass)
             {
-                std::cerr << "LSE mismatch found at batch: " << wb << std::endl
+                std::cerr << "OUT mismatch found at batch: " << wb << std::endl
                           << "\tseqlen_q: " << real_seqlen_q << std::endl
                           << "\tseqlen_k: " << real_seqlen_k << std::endl
                           << "\tseqstart_q: " << seqstart_q_host << std::endl
@@ -1578,33 +1723,67 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
                 break;
             }
+
+            if(lse)
+            {
+                ck_tile::HostTensor<SMPLComputeDataType> lse_host_result({nhead, real_seqlen_q});
+                const ck_tile::index_t query_offset_lse =
+                    (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
+                lse_host_result.ForEach([&](auto& self, auto idx) {
+                    self(idx) = lse_host(b_idx, idx[0], idx[1] + query_offset_lse);
+                });
+
+                cur_pass = ck_tile::check_err(lse_host_result,
+                                              lse_host_ref,
+                                              "LSE Error: Incorrect results!",
+                                              rtol,
+                                              atol,
+                                              /* allow_infinity_ref = */ true);
+
+                pass &= cur_pass;
+                if(!cur_pass)
+                {
+                    std::cerr << "LSE mismatch found at batch: " << wb << std::endl
+                              << "\tseqlen_q: " << real_seqlen_q << std::endl
+                              << "\tseqlen_k: " << real_seqlen_k << std::endl
+                              << "\tseqstart_q: " << seqstart_q_host << std::endl
+                              << "\tseqstart_k: " << seqstart_k_host << std::endl;
+
+                    break;
+                }
+            }
         }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
-    std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    if(json)
+    {
+        dump_fmha_fwd_json_results(*json,
+                                   data_type,
+                                   mode == mode_enum::batch ? "batch" : "group",
+                                   io_layout(i_perm, o_perm),
+                                   batch,
+                                   nhead,
+                                   nhead_k,
+                                   seqlen_qs[0],
+                                   seqlen_ks[0],
+                                   seqlen_kpads[0],
+                                   hdim_q,
+                                   hdim_v,
+                                   scale_s,
+                                   p_drop,
+                                   lse,
+                                   squant,
+                                   bias.type == bias_enum::elementwise_bias
+                                       ? "elementwise_bias"
+                                       : (bias.type == bias_enum::alibi ? "alibi" : "no_bias"),
+                                   is_v_rowmajor ? "r" : "c",
+                                   pass,
+                                   ave_time,
+                                   tflops,
+                                   gb_per_sec);
+    }
 
-    return pass;
-}
-
-int main(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
-    {
-        return run<FmhaFwdFp16>(arg_parser) ? 0 : -2;
-    }
-    else if(data_type == "bf16")
-    {
-        return run<FmhaFwdBf16>(arg_parser) ? 0 : -2;
-    }
-    else if(data_type == "fp8")
-    {
-        return run<FmhaFwdFp8>(arg_parser) ? 0 : -2;
-    }
-
-    return -3;
+    return pass ? fwd_result::success : fwd_result::failure;
 }
diff --git a/example/ck_tile/01_fmha/fmha_fwd_v3.hpp b/example/ck_tile/01_fmha/fmha_fwd_v3.hpp
index 5361d27f0f..4bd1d1a367 100644
--- a/example/ck_tile/01_fmha/fmha_fwd_v3.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_v3.hpp
@@ -34,7 +34,8 @@ struct fmha_fwd_v3_args
 
     index_t window_size_left;
     index_t window_size_right;
-    index_t mask_type;
+    index_t mask_type; // should be 0 for no mask; or 2 for causal mask (window_size_left < 0 and
+                       // window_size_right == 0).
 
     const void* q_ptr;
     index_t stride_q;
@@ -55,6 +56,11 @@ struct fmha_fwd_v3_args
     index_t stride_o;
     index_t nhead_stride_o;
     index_t batch_stride_o;
+
+    // Optional batch-mode cumulative seqlen overrides (exclude PAD)
+    // If provided, they override per-batch effective lengths to skip tail padding.
+    const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr; // [batch+1]
+    const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr; // [batch+1]
 };
 
 std::ostream& operator<<(std::ostream& stream, const fmha_fwd_v3_args::data_type_enum& data_type);
diff --git a/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp b/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp
index d6e4ac4c60..194675f962 100644
--- a/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
 
 #include "fmha_fwd_v3.hpp"
+#include "mask.hpp"
 
 #define INST_FMHA_FWD_V3_DISPATCH(kernel_traits)                                               \
     template <>                                                                                \
@@ -79,7 +80,7 @@ struct fmha_fwd_v3_kernel_traits
                                             -1     // kBlockPerCu
                                             >;
 
-    using fmha_mask = SimplifiedGenericAttentionMask<IsMasking>;
+    using fmha_mask = GenericAttentionMask<IsMasking, /*IsLocal=*/false>;
 
     using fmha_pipeline_problem =
         BlockFmhaFwdV3PipelineProblem<typename fmha_fwd_v3_problem_traits<date_type>::qkvp_dtype,
@@ -112,6 +113,22 @@ struct fmha_fwd_v3_kernel_traits
 template <typename Kernel>
 float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_config& config)
 {
+    /// NOTICE: This was borrowed from Aiter. Make sure the selected remap_opt setting truly
+    /// maximizes the kernel's performance.
+    int remap_opt = 2;
+    if(args.mask_type != static_cast<int>(mask_enum::no_mask) &&
+       ((args.nhead_q % 8 != 0) || (16384 < args.seqlen_q)))
+    {
+        if(65536 <= args.seqlen_q)
+        {
+            remap_opt = 0;
+        }
+        else
+        {
+            remap_opt = 1;
+        }
+    }
+
     auto kargs = Kernel::MakeKargs(args.q_ptr,
                                    args.k_ptr,
                                    args.v_ptr,
@@ -140,7 +157,10 @@ float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_confi
                                    args.batch_stride_o,
                                    args.window_size_left,
                                    args.window_size_right,
-                                   args.mask_type);
+                                   args.mask_type,
+                                   remap_opt,
+                                   args.cu_seqlen_q_ptr,
+                                   args.cu_seqlen_kv_ptr);
 
     dim3 grids            = Kernel::GridSize(args.batch, args.nhead_q, args.seqlen_q, args.hdim_v);
     constexpr dim3 blocks = Kernel::BlockSize();
diff --git a/example/ck_tile/01_fmha/mask.hpp b/example/ck_tile/01_fmha/mask.hpp
index af38ff0214..2dfe0e7c52 100644
--- a/example/ck_tile/01_fmha/mask.hpp
+++ b/example/ck_tile/01_fmha/mask.hpp
@@ -39,6 +39,7 @@ struct mask_info
             os << "g(" << y << ":" << x << ")";
         }
     }
+
     static mask_info decode(std::string str, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k)
     {
         ck_tile::index_t x_total = seqlen_k;
@@ -54,7 +55,7 @@ struct mask_info
             if(t == "xt" || t == "xb")
             {
                 // xformer style sliding window attn from top-left
-                ck_tile::index_t window_size = atoi(v.c_str());
+                ck_tile::index_t window_size = std::stoi(v);
                 ck_tile::index_t left_size   = -1;
                 ck_tile::index_t right_size  = 0;
                 if(window_size > 0)
@@ -71,18 +72,15 @@ struct mask_info
                 tmp.left  = left_size;
                 tmp.right = right_size;
             }
-            else
+            else if(t == "t" || t == "b" || t == "g")
             {
                 auto found_1 = v.find(",");
                 if(found_1 == std::string::npos)
                 {
-                    printf("not supported value %s, %s\n", v.c_str(), str.c_str());
-                    assert(0);
+                    throw std::invalid_argument("invalid mask value: " + str);
                 }
-                tmp.type            = mask_enum::window_generic;
-                ck_tile::index_t v0 = atoi(v.substr(0, found_1).c_str());
-                ck_tile::index_t v1 = atoi(v.substr(found_1 + 1).c_str());
-                // TODO: some validation
+                ck_tile::index_t v0 = std::stoi(v.substr(0, found_1));
+                ck_tile::index_t v1 = std::stoi(v.substr(found_1 + 1));
                 if(t == "t")
                 {
                     tmp.type = mask_enum::mask_top_left;
@@ -105,53 +103,45 @@ struct mask_info
                 }
                 else if(t == "g")
                 {
+                    tmp.type  = mask_enum::window_generic;
                     tmp.y     = v0;
                     tmp.x     = v1;
                     tmp.left  = v0; // TODO: don't use this?
                     tmp.right = v1;
                 }
-                else
-                {
-                    printf("not supported type %s, %s\n", t.c_str(), str.c_str());
-                    assert(0);
-                }
             }
+            else
+            {
+                throw std::invalid_argument("invalid mask value: " + str);
+            }
+        }
+        else if(str == "0")
+        {
+            tmp.type = mask_enum::no_mask;
+        }
+        else if(str == "1" || str == "t")
+        {
+            tmp.type  = mask_enum::mask_top_left;
+            tmp.y     = seqlen_q;
+            tmp.x     = 1;
+            tmp.left  = -1;
+            tmp.right = 0;
+        }
+        else if(str == "2" || str == "b")
+        {
+            tmp.type  = mask_enum::mask_bottom_right;
+            tmp.y     = seqlen_q;
+            tmp.x     = seqlen_k - seqlen_q + 1;
+            tmp.left  = -1;
+            tmp.right = 0;
         }
         else
         {
-            auto set_causal_top_left = [&]() {
-                tmp.type  = mask_enum::mask_top_left;
-                tmp.y     = seqlen_q;
-                tmp.x     = 1;
-                tmp.left  = -1;
-                tmp.right = 0;
-            };
-            auto set_causal_bottom_right = [&]() {
-                tmp.type  = mask_enum::mask_bottom_right;
-                tmp.y     = seqlen_q;
-                tmp.x     = seqlen_k - seqlen_q + 1;
-                tmp.left  = -1;
-                tmp.right = 0;
-            };
-            if(str == "t")
-                set_causal_top_left();
-            else if(str == "b")
-                set_causal_bottom_right();
-            else
-            {
-                tmp.type = static_cast<mask_enum>(atoi(str.c_str()));
-                if(tmp.type == mask_enum::mask_top_left)
-                {
-                    set_causal_top_left();
-                }
-                else if(tmp.type == mask_enum::mask_bottom_right)
-                {
-                    set_causal_bottom_right();
-                }
-            }
+            throw std::invalid_argument("invalid mask value: " + str);
         }
         return tmp;
     }
+
     ck_tile::index_t get_unmaskarea() const
     {
         if(type == mask_enum::no_mask)
@@ -168,6 +158,7 @@ struct mask_info
         }
         return area;
     }
+
     friend std::ostream& operator<<(std::ostream& os, const mask_info& mi)
     {
         mi.serialize(os);
diff --git a/example/ck_tile/01_fmha/script/benchmark_fwd.sh b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
index 88c16cceb6..31ad800039 100755
--- a/example/ck_tile/01_fmha/script/benchmark_fwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
@@ -18,3 +18,36 @@ $EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kn
 done
 done
 done
+
+#Padding Benchmarks: batch mode (baseline vs low/med/high pad)
+prec="fp16"
+base_batch_args="-prec=$prec -mode=0 -b=4 -h=16 -h_k=16 -d=128 -s=1024 -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
+
+# baseline (no pad)
+$EXE $base_batch_args
+
+# low pad (≈90–95% effective)
+$EXE $base_batch_args -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
+
+# medium pad (≈60–75% effective)
+$EXE $base_batch_args -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
+
+# high pad (≈30–40% effective)
+$EXE $base_batch_args -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
+
+# Padding Benchmarks: group mode (baseline vs low/med/high physical pad)
+seqlens_q="1024,768,512,256"
+seqlens_k="1024,768,512,256"
+base_group_args="-prec=$prec -mode=1 -b=4 -h=16 -h_k=16 -d=128 -s=$seqlens_q -s_k=$seqlens_k -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
+
+# baseline (no physical pad)
+$EXE $base_group_args
+
+# low physical pad
+$EXE $base_group_args -s_qpad=1152,896,576,320 -s_kpad=1152,896,576,320
+
+# medium physical pad
+$EXE $base_group_args -s_qpad=1536,1152,768,384 -s_kpad=1536,1152,768,384
+
+# high physical pad
+$EXE $base_group_args -s_qpad=2048,1536,1024,512 -s_kpad=2048,1536,1024,512
diff --git a/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh b/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
index 9c500edf9d..a3f7d68eb3 100755
--- a/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
@@ -8,24 +8,35 @@ for prec in "fp16" "bf16" ; do
 for hdim in 128 ; do
 for perm in 0 ; do
 
-if [ $causal -eq 0 ]; then
-    mask=0
-else
-    mask=b:-1,0
-fi
-
-$EXE -prec=$prec -b=32 -h=16        -s=512   -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=16 -h=16        -s=1024  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=8  -h=16        -s=2048  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=4  -h=16        -s=4096  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=2  -h=16        -s=8192  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=16        -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=32 -h=16        -s=512   -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=16 -h=16        -s=1024  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=8  -h=16        -s=2048  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=4  -h=16        -s=4096  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=2  -h=16        -s=8192  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
                                           
-$EXE -prec=$prec -b=1  -h=64        -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=16 -h_k=1 -s=65536 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=40        -s=37200 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=64        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16 -h_k=1 -s=65536 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=40        -s=37200 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
 
 done
 done
 done
 done
+
+# Padding benchmark comparisons for v3 (batch mode only)
+# ==== V3 Padding Benchmarks: batch mode (baseline vs low/med/high pad) ====
+prec="fp16"
+base_v3_args="-prec=$prec -b=4 -h=16 -d=128 -s=1024 -mask=0 -iperm=0 -operm=0 -v=$VALID"
+
+# baseline (no pad)
+$EXE $base_v3_args
+
+# low pad (≈90–95% effective)
+$EXE $base_v3_args -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
+
+# medium pad (≈60–75% effective)
+$EXE $base_v3_args -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
+
+# high pad (≈30–40% effective)
+$EXE $base_v3_args -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
diff --git a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx90a.txt b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx90a.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx942.txt b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx942.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx950.txt b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx950.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx90a.txt b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx90a.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx942.txt b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx942.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx950.txt b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx950.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh
index e7babd2744..5c2a5a4b3d 100755
--- a/example/ck_tile/01_fmha/script/run_full_test.sh
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
@@ -34,15 +34,15 @@ function print_log_header(){
 }
 
 #run verification tests
-example/ck_tile/01_fmha/script/smoke_test_fwd.sh
-example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+time example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+time example/ck_tile/01_fmha/script/smoke_test_bwd.sh
 
 #run performance benchmarks
 export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
 print_log_header $fmha_fwd_log $env_type $branch $host_name
-example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
+time example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
 
 export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
 print_log_header $fmha_bwd_log $env_type $branch $host_name
-example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
+time example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
 
diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
index d123f842a2..cd51dde2d4 100755
--- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
@@ -2,14 +2,46 @@
 # TODO: run this script from CK root or build directory
 set -euo pipefail
 
-EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)"
+SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+EXE_NAME=tile_example_fmha_bwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
 KNAME=1
+GPU_arch=${GPU_arch:-""}
+if [ -z "$GPU_arch" ] ; then
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
 
 export CK_WARMUP=0
 export CK_REPEAT=1
 
+CURR_FAILS_FILE=${CURR_FAILS_FILE:-"fmha_bwd_fails_$GPU_arch.txt"}
+rm -f $CURR_FAILS_FILE
+touch $CURR_FAILS_FILE
+KNOWN_FAILS_FILE=${KNOWN_FAILS_FILE:-"$SCRIPT_DIR/fmha_bwd_known_fails_$GPU_arch.txt"}
+
 COMMON_ARGS='-v=1'
+
+run_exe() {
+    set +ex
+    $EXE $@
+    local ret=$?
+    if [ $ret -ne 0 ] ; then
+        echo "$EXE_NAME $*" >> $CURR_FAILS_FILE
+    fi
+    set -ex
+}
+
+test_h_s_mask() {
+    run_exe -b=1 -h=4 -h_k=2 -s=259                         $@
+    run_exe -b=2 -h=2        -s=516 -s_k=253                $@
+    run_exe -b=1 -h=4 -h_k=1 -s=500 -s_k=251 -mask=1        $@
+    run_exe -b=1 -h=2        -s=900 -s_k=258 -mask=2        $@
+    run_exe -b=2 -h=1        -s=987 -s_k=219 -mask=t:128,30 $@
+    run_exe -b=2 -h=3 -h_k=1 -s=244 -s_k=499 -mask=b:4,35   $@
+}
+
 set -x
+# main tests
 for prec in "fp16" "bf16" ; do
 for perm in 0 1 ; do
 for hdim in 32 64 128 256 ; do
@@ -18,20 +50,41 @@ for bias in "n" "a" ; do
 for dbias in 0 ; do
 for p_drop in 0.0 0.2 ; do
 for deterministic in 0 ; do
+test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+done
+done
+done
+done
+done
+done
+done
+done
 
-$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259          -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm                -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=2        -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm                -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1        -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=2        -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2        -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=1        -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35   -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-
-done
-done
-done
-done
-done
-done
-done
+# additional cases
+for hdim in 40 48 72 96 ; do
+test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
+test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0   -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
+test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
 done
 set +x
+
+new_fails_count=0
+known_fails_count=0
+if [ -f $KNOWN_FAILS_FILE ] ; then
+    echo "Comparing current fails ($CURR_FAILS_FILE) against known fails ($KNOWN_FAILS_FILE):"
+    while IFS= read -r line; do
+        if grep -Fxq "$line" $KNOWN_FAILS_FILE; then
+            echo "Known fail: $line"
+            known_fails_count=$(($known_fails_count + 1))
+        else
+            echo "New fail: $line"
+            new_fails_count=$(($new_fails_count + 1))
+        fi
+    done < $CURR_FAILS_FILE
+else
+    new_fails_count=$(wc -l < $CURR_FAILS_FILE)
+    echo "No known fails file, all fails ($new_fails_count) are new:"
+    cat $CURR_FAILS_FILE
+fi
+echo "New fails count: $new_fails_count; Known fails count: $known_fails_count"
+exit $(($new_fails_count != 0))
diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
index 3913a0d5c2..fca6b8d0cd 100755
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -2,12 +2,23 @@
 # TODO: run this script from CK root or build directory
 set -euo pipefail
 
-EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
+SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+EXE_NAME=tile_example_fmha_fwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
 KNAME=1
+GPU_arch=$GPU_arch
+if [ -z "$GPU_arch" ] ; then
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
 
 export CK_WARMUP=0
 export CK_REPEAT=1
 
+CURR_FAILS_FILE=${CURR_FAILS_FILE:-"fmha_fwd_fails_$GPU_arch.txt"}
+rm -f $CURR_FAILS_FILE
+touch $CURR_FAILS_FILE
+KNOWN_FAILS_FILE=${KNOWN_FAILS_FILE:-"$SCRIPT_DIR/fmha_fwd_known_fails_$GPU_arch.txt"}
+
 COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
 # mode=0
 # export HIP_VISIBLE_DEVICES=4
@@ -30,6 +41,16 @@ while getopts ":sa" opt; do
     esac
 done
 
+run_exe() {
+    set +ex
+    $EXE $@
+    local ret=$?
+    if [ $ret -ne 0 ] ; then
+        echo "$EXE_NAME $*" >> $CURR_FAILS_FILE
+    fi
+    set -ex
+}
+
 run_fp16_bf16_tests() {
     local NUM_SPLITS="1"
     local PAGE_BLOCK_SIZE="0"
@@ -52,16 +73,16 @@ run_fp16_bf16_tests() {
     for page_block_size in $PAGE_BLOCK_SIZE ; do
     for cache_batch_idx in $CACHE_BATCH_IDX ; do
 
-    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16    -d_v=$hdim -s=55   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=3        -d=$hdim            -s=100  -s_k=51             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=16    -d_v=$hdim -s=99   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1024 -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim -d_v=24    -s=3    -s_k=99             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim            -s=200  -s_k=520            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim            -s=99   -s_k=32             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35   -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=33   -s_k=0              -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1    -s_k=10  -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    # run_exe -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16    -d_v=$hdim -s=55   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=1 -h=3        -d=$hdim            -s=100  -s_k=51             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=2 -h=1        -d=16    -d_v=$hdim -s=99   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1024 -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim -d_v=24    -s=3    -s_k=99             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim            -s=200  -s_k=520            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim            -s=99   -s_k=32             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35   -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=33   -s_k=0              -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    run_exe -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1    -s_k=10  -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
 
     done ; done ; done ; done ; done
     done ; done ; done ; done ; done
@@ -73,7 +94,29 @@ run_fp8_tests() {
     for b in 1 2 ; do
     for hdim in 64 128 256 ; do
     
-    $EXE -prec=fp8 -init=3 -b=$b -h=1 -d=128 -s=128 -bias=$bias -iperm=$perm -operm=$perm -vlayout=c -squant=1 -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=fp8 -init=0 -b=$b -h=1 -d=128 -s=128 -bias=$bias -iperm=$perm -operm=$perm -vlayout=r -squant=1 -kname=$KNAME $COMMON_ARGS
+
+    done ; done ; done ; done
+}
+
+run_fp8bf16_tests() {
+    for perm in 0 1 ; do
+    for bias in "n" "e" "a" ; do
+    for b in 1 2 ; do
+    for hdim in 64 128 256 ; do
+
+    $EXE -prec=fp8bf16 -init=0 -b=$b -h=1 -d=128 -s=128 -bias=$bias -iperm=$perm -operm=$perm -vlayout=r -squant=1 -kname=$KNAME $COMMON_ARGS
+
+    done ; done ; done ; done
+}
+
+run_fp8fp32_tests() {
+    for perm in 0 1 ; do
+    for bias in "n" "e" "a" ; do
+    for b in 1 2 ; do
+    for hdim in 64 128 256 ; do
+
+    $EXE -prec=fp8fp32 -init=0 -b=$b -h=1 -d=128 -s=128 -bias=$bias -iperm=$perm -operm=$perm -vlayout=r -squant=1 -kname=$KNAME $COMMON_ARGS
 
     done ; done ; done ; done
 }
@@ -88,19 +131,151 @@ run_fp16_appendkv_tests() {
     for page_block_size in 0 128 ; do
     for cache_batch_idx in 0 1 ; do
 
-    $EXE -prec=fp16 -b=3 -h=3 -d=$hdim -s=$s -s_k=$s_k -s_knew=$s_knew -rotary_dim=$rdim -rotary_interleaved=$ri -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -iperm=1 -operm=1 -kname=1 $COMMON_ARGS
+    run_exe -prec=fp16 -b=3 -h=3 -d=$hdim -s=$s -s_k=$s_k -s_knew=$s_knew -rotary_dim=$rdim -rotary_interleaved=$ri -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -iperm=1 -operm=1 -kname=1 $COMMON_ARGS
 
     done ; done ; done ; done ; done 
     done ; done ; done
 }
 
+run_padding_smoke_tests() {
+    # Padding-only smoke tests for batch/group mode using COMMON_ARGS
+    local prec="fp16"
+
+    # Batch mode: padding via effective lengths (exclude PAD)
+    # Use lse=1 to select a non-trload kernel and avoid overly strict tolerance mismatches
+    local base_batch="-prec=$prec -mode=0 -b=4 -h=16 -h_k=16 -d=128 -s=1024 -bias=n -mask=0 -lse=1 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS"
+    # low pad (≈90–95% effective)
+    $EXE $base_batch -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
+    # medium pad (≈60–75% effective)
+    $EXE $base_batch -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
+    # high pad (≈30–40% effective)
+    $EXE $base_batch -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
+
+    # Group mode: padding via physical stride along seqlen
+    local seqlens_q="1024,768,512,256"
+    local seqlens_k="1024,768,512,256"
+    local base_group="-prec=$prec -mode=1 -b=4 -h=16 -h_k=16 -d=128 -s=$seqlens_q -s_k=$seqlens_k -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS"
+    # low physical pad
+    $EXE $base_group -s_qpad=1152,896,576,320 -s_kpad=1152,896,576,320
+    # medium physical pad
+    $EXE $base_group -s_qpad=1536,1152,768,384 -s_kpad=1536,1152,768,384
+    # high physical pad
+    $EXE $base_group -s_qpad=2048,1536,1024,512 -s_kpad=2048,1536,1024,512
+}
+
+run_padding_basic_boundary_tests() {
+    # Basic padding and boundary tests (reference: smoke_test_fwd_pad.sh)
+    local prec
+    local perm
+
+    # Group mode: Q&K padded with per-batch different strides
+    for prec in fp16 bf16 ; do
+      for perm in 0 1 ; do
+        $EXE -prec=$prec -mode=1 -b=2 -h=2 -h_k=1 -d=16 -d_v=32 \
+             -s=55 -s_k=256 -s_qpad=64,60 -s_kpad=272,260 \
+             -bias=n -p_drop=0.0 -lse=0 -iperm=$perm -operm=$perm \
+             -num_splits=1 -page_block_size=0 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS
+      done
+    done
+
+    # slightly larger, uneven padding strides
+    for prec in fp16 bf16 ; do
+      for perm in 0 1 ; do
+        $EXE -prec=$prec -mode=1 -b=3 -h=2 -h_k=1 -d=64 -d_v=64 \
+             -s=50,60,40 -s_k=128,256,192 -s_qpad=64,64,64 -s_kpad=160,288,224 \
+             -bias=n -p_drop=0.0 -lse=1 -iperm=$perm -operm=$perm \
+             -num_splits=1 -page_block_size=0 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS
+      done
+    done
+
+    # only K padded; Q unpadded
+    for prec in fp16 bf16 ; do
+      for perm in 0 1 ; do
+        $EXE -prec=$prec -mode=1 -b=2 -h=2 -h_k=1 -d=32 -d_v=64 \
+             -s=55 -s_k=256 -s_kpad=272,260 \
+             -bias=n -p_drop=0.0 -lse=1 -iperm=$perm -operm=$perm \
+             -num_splits=1 -page_block_size=0 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS
+      done
+    done
+
+    # use cu_seqlen overrides to skip tail PAD
+    for prec in fp16 bf16 ; do
+      for perm in 0 1 ; do
+        $EXE -prec=$prec -mode=0 -b=4 -h=8 -h_k=8 -d=128 -s=3 -s_k=3 \
+             -q_eff_lens=1,2,1,2 -kv_eff_lens=1,2,1,2 \
+             -bias=n -p_drop=0.0 -lse=1 -iperm=$perm -operm=$perm \
+             -num_splits=1 -page_block_size=0 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS
+
+        $EXE -prec=$prec -mode=0 -b=2 -h=2 -h_k=1 -d=32 -d_v=64 -s=64 -s_k=256 \
+             -q_eff_lens=55,60 -kv_eff_lens=200,256 \
+             -bias=n -p_drop=0.0 -lse=0 -iperm=$perm -operm=$perm \
+             -num_splits=1 -page_block_size=0 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS
+      done
+    done
+
+    # no padding (equal), mixed Q/KV, all len=1
+    for prec in fp16 bf16 ; do
+      $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+           -q_eff_lens=128,128,128,128 -kv_eff_lens=128,128,128,128 \
+           -bias=n -p_drop=0.0 -lse=1 -kname=$KNAME $COMMON_ARGS
+
+      $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+           -q_eff_lens=10,20,30,40 -kv_eff_lens=40,30,20,10 \
+           -bias=n -p_drop=0.0 -lse=1 -kname=$KNAME $COMMON_ARGS
+
+      $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+           -q_eff_lens=1,1,1,1 -kv_eff_lens=1,1,1,1 \
+           -bias=n -p_drop=0.0 -lse=1 -kname=$KNAME $COMMON_ARGS
+    done
+
+    # highly variable logical lengths
+    for prec in fp16 bf16 ; do
+      $EXE -prec=$prec -mode=1 -b=4 -h=4 -d=32 \
+           -s=1,127,3,65 -s_k=1,127,3,65 -s_kpad=128 \
+           -bias=n -p_drop=0.0 -lse=1 -kname=$KNAME $COMMON_ARGS
+    done
+
+    # GQA + Alibi + Causal mask (keep vlayout row-major for fp16/bf16
+    for prec in fp16 bf16 ; do
+      $EXE -prec=$prec -mode=1 -b=2 -h=16 -h_k=4 -d=128 \
+           -s=256,129 -s_k=256,129 -s_kpad=256 \
+           -bias=a -mask=t -lse=1 -iperm=0 -operm=0 -vlayout=r \
+           -kname=$KNAME $COMMON_ARGS
+    done
+}
+
 set -x
 
 run_fp16_bf16_tests
+run_padding_smoke_tests
+run_padding_basic_boundary_tests
 run_fp8_tests
+run_fp8bf16_tests
+run_fp8fp32_tests
 
 if [ $TEST_APPENDKV -eq 1 ] ; then
     run_fp16_appendkv_tests
 fi
 
 set +x
+
+new_fails_count=0
+known_fails_count=0
+if [ -f $KNOWN_FAILS_FILE ] ; then
+    echo "Comparing current fails ($CURR_FAILS_FILE) against known fails ($KNOWN_FAILS_FILE):"
+    while IFS= read -r line; do
+        if grep -Fxq "$line" $KNOWN_FAILS_FILE; then
+            echo "Known fail: $line"
+            known_fails_count=$(($known_fails_count + 1))
+        else
+            echo "New fail: $line"
+            new_fails_count=$(($new_fails_count + 1))
+        fi
+    done < $CURR_FAILS_FILE
+else
+    new_fails_count=$(wc -l < $CURR_FAILS_FILE)
+    echo "No known fails file, all fails ($new_fails_count) are new:"
+    cat $CURR_FAILS_FILE
+fi
+echo "New fails count: $new_fails_count; Known fails count: $known_fails_count"
+exit $(($new_fails_count != 0))
diff --git a/example/ck_tile/01_fmha/utils.hpp b/example/ck_tile/01_fmha/utils.hpp
index faf3f08437..7f44d87180 100644
--- a/example/ck_tile/01_fmha/utils.hpp
+++ b/example/ck_tile/01_fmha/utils.hpp
@@ -1,11 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include <algorithm>
 #include <cstdint>
-#include <cstdlib>
 #include <functional>
 #include <optional>
 #include <ostream>
@@ -28,6 +27,23 @@ std::ostream& operator<<(std::ostream& stream, mode_enum mode)
     return stream << (mode == mode_enum::batch ? "batch" : "group");
 }
 
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    using size_type = typename std::vector<T>::size_type;
+
+    os << "[";
+    for(size_type idx = 0; idx < v.size(); ++idx)
+    {
+        if(0 < idx)
+        {
+            os << ", ";
+        }
+        os << v[idx];
+    }
+    return os << "]";
+}
+
 std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
 {
     std::vector<int32_t> seqstarts = {0};
@@ -39,12 +55,13 @@ std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
     return seqstarts;
 }
 
+template <typename RandomEngine>
 std::vector<int32_t> generate_seqlens(mode_enum mode,
                                       unsigned count,
                                       int32_t seqlen_avg,
-                                      int32_t seqlen_min = -1, // if not negative, clamp min
-                                      int32_t seqlen_max = -1, // if not negative, clamp max
-                                      std::optional<unsigned> seed = std::nullopt)
+                                      int32_t seqlen_min, // if not negative, clamp min
+                                      int32_t seqlen_max, // if not negative, clamp max
+                                      RandomEngine& random_engine)
 {
     assert(0 < count);
 
@@ -58,7 +75,6 @@ std::vector<int32_t> generate_seqlens(mode_enum mode,
     {
         using size_type = std::vector<int32_t>::size_type;
 
-        std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
         std::uniform_int_distribution<size_type> idx_dist(0, count - 1);
         auto next_idx = std::bind(idx_dist, std::ref(random_engine));
 
@@ -89,43 +105,31 @@ std::vector<int32_t> generate_seqlens(mode_enum mode,
     return seqlens;
 }
 
-std::vector<int32_t> generate_seqstarts(mode_enum mode,
-                                        unsigned count,
-                                        int32_t seqlen_avg,
-                                        int32_t seqlen_min           = -1,
-                                        int32_t seqlen_max           = -1,
-                                        std::optional<unsigned> seed = std::nullopt)
-{
-    return to_seqstarts(generate_seqlens(mode, count, seqlen_avg, seqlen_min, seqlen_max, seed));
-}
-
 // return random integer generated uniformly in range [low, high]
-template <typename Int = int>
-auto randint(Int low, Int high, std::optional<unsigned> seed = std::nullopt)
-    -> std::enable_if_t<std::is_integral_v<Int>, Int>
+template <typename Int = int, typename RandomEngine>
+auto randint(Int low,
+             Int high,
+             RandomEngine& random_engine) -> std::enable_if_t<std::is_integral_v<Int>, Int>
 {
-    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
     std::uniform_int_distribution<Int> dist(low, high);
-    return dist(engine);
+    return dist(random_engine);
 }
 
 // return random integers generated uniformly in range [low, high]
-template <typename Int, typename ForwardIterator>
+template <typename Int, typename ForwardIterator, typename RandomEngine>
 auto randints(ForwardIterator first,
               ForwardIterator last,
               Int low,
               Int high,
-              std::optional<unsigned> seed = std::nullopt)
-    -> std::enable_if_t<std::is_integral_v<Int>>
+              RandomEngine& random_engine) -> std::enable_if_t<std::is_integral_v<Int>>
 {
-    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
     std::uniform_int_distribution<Int> dist(low, high);
 
-    std::generate(first, last, [&] { return dist(engine); });
+    std::generate(first, last, [&] { return dist(random_engine); });
 }
 
 /*
- * decode the seqlen string from cmdline
+ * generate missing values in *_val randomly when the number of values is smaller than batch
  * example (assume batch=3)
  *   q_val=1,2,3 k_val=4,5,6 -> OK
  *   q_val=1,2,3             -> OK, k same as q
@@ -136,23 +140,23 @@ auto randints(ForwardIterator first,
  *   q_val=1,2   k_val=4,5,6 -> not OK, k must have same splits with q
  *   q_val=1,2   k_val=4     -> not OK, k must have same splits with q
  */
+template <typename RandomEngine>
 std::tuple<std::vector<ck_tile::index_t>,
            std::vector<ck_tile::index_t>,
            std::vector<ck_tile::index_t>>
-decode_seqlen(mode_enum mode,
-              ck_tile::index_t batch,
-              std::string q_val,
-              std::string k_val,
-              std::string k_pad_val,
-              ck_tile::index_t seqlen_k_min = 0,
-              bool need_append_kvcache      = false,
-              std::optional<unsigned> seed  = std::nullopt)
+generate_missing_seqlens(mode_enum mode,
+                         ck_tile::index_t batch,
+                         const std::vector<ck_tile::index_t>& q_val,
+                         const std::vector<ck_tile::index_t>& k_val,
+                         const std::vector<ck_tile::index_t>& k_pad_val,
+                         ck_tile::index_t seqlen_k_min,
+                         bool need_append_kvcache,
+                         RandomEngine& random_engine)
 {
-#define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
     if(mode == mode_enum::batch)
     {
-        ck_tile::index_t q = _S2I_(q_val);
-        ck_tile::index_t k = _S2I_(k_val);
+        ck_tile::index_t q = q_val[0];
+        ck_tile::index_t k = k_val[0];
 
         auto s_q = std::vector<ck_tile::index_t>(batch, q);
         auto s_k = [&] {
@@ -166,7 +170,7 @@ decode_seqlen(mode_enum mode,
                          seqlen_ks.end(),
                          seqlen_k_min,
                          seqlen_k_max,
-                         seed);
+                         random_engine);
                 return seqlen_ks;
             }
 
@@ -187,25 +191,19 @@ decode_seqlen(mode_enum mode,
     }
     else
     {
-        ck_tile::index_t idx          = 0;
-        std::string::size_type pos_q  = 0;
-        std::string::size_type pos_k  = 0;
-        std::string::size_type pos_kp = 0;
         std::vector<ck_tile::index_t> s_q;
         std::vector<ck_tile::index_t> s_k;
         std::vector<ck_tile::index_t> s_kpad;
-        while(true)
+        ck_tile::index_t idx = 0;
+        for(; idx < std::min(static_cast<ck_tile::index_t>(q_val.size()), batch); ++idx)
         {
-            auto found_q  = q_val.find(',', pos_q);
-            auto found_k  = k_val.find(',', pos_k);
-            auto found_kp = k_pad_val.find(',', pos_kp);
-
-            ck_tile::index_t q = _S2I_(
-                q_val.substr(pos_q, found_q == std::string::npos ? found_q : found_q - pos_q));
-            ck_tile::index_t k = _S2I_(
-                k_val.substr(pos_k, found_k == std::string::npos ? found_k : found_k - pos_k));
-            ck_tile::index_t kp = _S2I_(k_pad_val.substr(
-                pos_kp, found_kp == std::string::npos ? found_kp : found_kp - pos_kp));
+            ck_tile::index_t q = q_val[idx];
+            ck_tile::index_t k =
+                k_val[std::min(idx, static_cast<ck_tile::index_t>(k_val.size()) - 1)];
+            ck_tile::index_t kp =
+                k_pad_val.empty()
+                    ? -1
+                    : k_pad_val[std::min(idx, static_cast<ck_tile::index_t>(k_pad_val.size()) - 1)];
 
             s_q.push_back(q);
             s_k.push_back(k < 0 ? q : k);
@@ -219,21 +217,13 @@ decode_seqlen(mode_enum mode,
                     << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
                 throw std::runtime_error(msg.str());
             }
-
-            idx++;
-            if(found_q == std::string::npos || idx >= batch)
-            {
-                break;
-            }
-            pos_q  = found_q + 1;
-            pos_k  = found_k == std::string::npos ? pos_k : found_k + 1;
-            pos_kp = found_kp == std::string::npos ? pos_kp : found_kp + 1;
         }
         if(idx < batch)
         {
-            auto rem_q = generate_seqlens(mode, batch - idx, s_q.back(), 1, s_kpad.back(), seed);
-            auto rem_k =
-                generate_seqlens(mode, batch - idx, s_k.back(), seqlen_k_min, s_kpad.back(), seed);
+            auto rem_q =
+                generate_seqlens(mode, batch - idx, s_q.back(), 1, s_q.back(), random_engine);
+            auto rem_k = generate_seqlens(
+                mode, batch - idx, s_k.back(), seqlen_k_min, s_kpad.back(), random_engine);
 
             s_q.insert(s_q.end(), rem_q.begin(), rem_q.end());
             s_k.insert(s_k.end(), rem_k.begin(), rem_k.end());
@@ -241,26 +231,14 @@ decode_seqlen(mode_enum mode,
         }
         return std::make_tuple(s_q, s_k, s_kpad);
     }
-#undef _S2I_
 }
 
-int env_get_int(const char* var_name, int default_int)
-{
-    char* v = getenv(var_name);
-    int r   = default_int;
-    if(v)
-        r = std::atoi(v);
-    return r;
-}
-
-template <typename RandomAccessIterator, typename Int>
+template <typename RandomAccessIterator, typename Int, typename RandomEngine>
 std::enable_if_t<std::is_integral_v<Int>> iota_shuffle(RandomAccessIterator first,
                                                        RandomAccessIterator last,
                                                        Int value,
-                                                       std::optional<unsigned> seed = std::nullopt)
+                                                       RandomEngine& random_engine)
 {
     std::iota(first, last, value);
-
-    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
-    std::shuffle(first, last, engine);
+    std::shuffle(first, last, random_engine);
 }
diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md
index da74e2e3c1..3de48263f8 100644
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -65,6 +65,8 @@ args:
      -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
      -warmup    cold iter (default:5)
      -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:layernorm2d_fwd.json)
 
 ```
 
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index c4366f6662..b7512b2999 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -75,54 +75,17 @@ struct layernorm2d_fwd_traits_
     using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
     using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
     using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
 
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN           = kPadN_;
     static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index bdd5f2da1b..54f4e66336 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -1,5 +1,6 @@
 #include "ck_tile/host.hpp"
 #include "layernorm2d_fwd.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 #include <algorithm>
 #include <cstring>
 
@@ -53,7 +54,9 @@ auto create_args(int argc, char* argv[])
         .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
         .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
         .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "layernorm2d_fwd.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -405,6 +408,24 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_layernorm2d_fwd_json_results(arg_parser.get_str("jsonfile"),
+                                          prec_i,
+                                          prec_o,
+                                          prec_sm,
+                                          prec_sy,
+                                          m,
+                                          n,
+                                          x_stride,
+                                          xr_stride,
+                                          y_stride,
+                                          yr_stride,
+                                          pass,
+                                          ave_time,
+                                          0,
+                                          gb_per_sec);
+    }
     return pass;
 }
 
diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index 825cd6e522..d2112a67bf 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -2,6 +2,7 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
 add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
 add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp)
+add_executable(tile_example_gemm_splitk_two_stage EXCLUDE_FROM_ALL gemm_splitk_two_stage.cpp)
 set(EXAMPLE_GEMM_COMPILE_OPTIONS)
 set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
@@ -16,3 +17,4 @@ target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OP
 target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md
index 6358b76fd9..f4e0bb696c 100644
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -9,11 +9,11 @@ mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
 ../script/cmake-ck-dev.sh  ../ <arch>
 # The basic pipeline method on the gemm calculation
-make tile_example_gemm_basic -j
+make tile_example_gemm_basic -j`nproc`
 # The memory bound pipeline on the gemm calculation
-make tile_example_gemm_universal -j
+make tile_example_gemm_universal -j`nproc`
 # The weight preshuffle pipeline on the gemm calculation
-make tile_example_gemm_weight_preshuffle -j
+make tile_example_gemm_weight_preshuffle -j`nproc`
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic` & `build/bin/tile_example_gemm_universal`
 
@@ -30,11 +30,13 @@ args:
    -stride_b    Tensor B stride (default:0)
    -stride_c    Tensor C stride (default:0)
           -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
-       -prec    data type. fp16/bf16/fp8/bf8/int8 (default:fp16)
-     -warmup    number of iterations before benchmark the kernel (default:10)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:50)
      -repeat    number of iterations to benchmark the kernel (default:100)
       -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
     -split_k    splitK value (default:1)
-       -init    0:random, 1:linear, 2:constant (default:1)
+       -init    0:random, 1:linear, 2:constant(1) (default:0)
  -persistent    0:non-persistent, 1:persistent (default:0)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:gemm.json)
 ```
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index 99c943a7f1..d687e35f5d 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -2,185 +2,9 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_utils.hpp"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    if constexpr(Persistent)
-        std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
-
-    // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-#if CK_TILE_USE_WMMA
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 16;
-    constexpr ck_tile::index_t N_Warp_Tile = 16;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-#else
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-#endif
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
-                                                      GemmConfig::kPadN,
-                                                      GemmConfig::kPadK,
-                                                      ALayout,
-                                                      BLayout,
-                                                      CLayout>;
-
-    using CodegenPipelineProblem = ck_tile::
-        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-
-    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             ck_tile::tuple<>,
-                                             AccDataType,
-                                             CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             ck_tile::element_wise::PassThrough,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation>>;
-
-        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
-        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(MemoryOpSet{});
-    }
-    else
-    {
-        return Run(MemoryOpAtomicAdd{});
-    }
-}
-
 #include "run_gemm_example.inc"
-
-template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout,
-                               std::string b_layout,
-                               ck_tile::ArgParser& arg_parser)
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Col{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices when "
-                                     "BPrecType is ck_tile::pk_int4_t!");
-        }
-    }
-    else
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Col{}, Row{});
-        }
-        else if(a_layout == "R" && b_layout == "R")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Row{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "R")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Row{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-}
+#include "run_gemm_example_common.hpp"
+#include "gemm_basic_invoker.hpp"
 
 int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
@@ -188,36 +12,53 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
 
+    using GemmConfig = GemmConfigBase;
+    using Invoker    = BasicInvoker;
+
     if(data_type == "fp16")
     {
-        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, arg_parser);
+        return run_gemm_example_prec_type<GemmConfig, Invoker, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
-        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, arg_parser);
+        return run_gemm_example_prec_type<GemmConfig, Invoker, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
-        return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
-            a_layout, b_layout, arg_parser);
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf8")
     {
-        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
-            a_layout, b_layout, arg_parser);
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "i8")
     {
-        return run_gemm_example_prec_type<ck_tile::int8_t, ck_tile::int8_t, int32_t>(
-            a_layout, b_layout, arg_parser);
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          int32_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "pk_int4_t")
     {
         // TODO: Add support for bhalf_t ADataType
-        if constexpr(GemmConfigBase::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        if constexpr(GemmConfig::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
         {
-            return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
-                a_layout, b_layout, arg_parser);
+            return run_gemm_example_prec_type<GemmConfig,
+                                              Invoker,
+                                              ck_tile::half_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
         }
         else
         {
@@ -232,7 +73,9 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    auto arg_parser = create_args();
+    auto result     = arg_parser.parse(argc, argv);
+
     if(!result)
         return -1;
 
diff --git a/example/ck_tile/03_gemm/gemm_basic_invoker.hpp b/example/ck_tile/03_gemm/gemm_basic_invoker.hpp
new file mode 100644
index 0000000000..861374e268
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_basic_invoker.hpp
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "gemm_utils.hpp"
+
+struct BasicInvoker
+{
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename CLayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+    {
+        if constexpr(Persistent)
+        {
+            std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
+        }
+
+        // This part comes from the Codegen
+        constexpr ck_tile::index_t M_Tile = 256;
+        constexpr ck_tile::index_t N_Tile = 256;
+        constexpr ck_tile::index_t K_Tile = 64;
+
+#if CK_TILE_USE_WMMA
+        constexpr ck_tile::index_t M_Warp = 4;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 16;
+        constexpr ck_tile::index_t N_Warp_Tile = 16;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+#else
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+#endif
+
+        using CodegenGemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+        using CodegenGemmTraits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                          GemmConfig::kPadN,
+                                                          GemmConfig::kPadK,
+                                                          ALayout,
+                                                          BLayout,
+                                                          CLayout>;
+
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
+                                                                    BDataType,
+                                                                    AccDataType,
+                                                                    CodegenGemmShape,
+                                                                    CodegenGemmTraits>;
+
+        using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 CodegenPipelineProblem::TransposeC,
+                                                 memory_operation>>;
+
+            // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+            // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+            using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenGemmShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            // Declare rotating_mem_ptr here so it stays in scope until it is needed
+            std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+            std::function<void()> preprocess;
+
+            auto clear_gemm_output = [&]() {
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+
+            if(s.flush_cache_)
+            {
+                std::cout << "Flushing cache..." << std::endl;
+
+                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+                auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+                auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+                rotating_mem_ptr =
+                    std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                        kargs.as_ptr[0],
+                        kargs.bs_ptr[0],
+                        s.rotating_count_,
+                        size_a_buffer,
+                        size_b_buffer);
+                rotating_mem_ptr->Print();
+
+                preprocess = [&]() {
+                    ck_tile::flush_icache();
+                    rotating_mem_ptr->Next();
+                    clear_gemm_output();
+                };
+            }
+            else
+            {
+                preprocess = clear_gemm_output;
+            }
+
+            return ck_tile::launch_kernel_time_mask(
+                s,
+                preprocess,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        if(args.k_batch == 1)
+        {
+            return Run(MemoryOpSet{});
+        }
+        else
+        {
+            return Run(MemoryOpAtomicAdd{});
+        }
+    }
+};
diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage.cpp
new file mode 100644
index 0000000000..b4e0df711b
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+#include "run_gemm_example_common.hpp"
+#include "gemm_splitk_two_stage_invoker.hpp"
+
+template <template <typename PreType, typename WorkspaceType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    using Invoker = SplitKTwoStageInvoker;
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, float>,
+                                          Invoker,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, float>,
+                                          Invoker,
+                                          ck_tile::bf16_t>(a_layout, b_layout, arg_parser);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+    auto result     = arg_parser.parse(argc, argv);
+
+    if(!result)
+        return -1;
+
+    try
+    {
+#if CK_TILE_USE_WMMA
+        return !run_gemm_example<GemmConfigTwoStage_Wmma>(arg_parser);
+#else
+        return !run_gemm_example<GemmConfigTwoStage>(arg_parser);
+#endif
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_invoker.hpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_invoker.hpp
new file mode 100644
index 0000000000..8c7589dabb
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_invoker.hpp
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "gemm_utils.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+
+template <typename PrecType_, typename WorkspaceType_>
+struct GemmConfigTwoStage : public GemmConfigComputeV3<PrecType_>
+{
+    using WorkspaceType = ck_tile::remove_cvref_t<WorkspaceType_>;
+};
+
+template <typename PrecType_, typename WorkspaceType_>
+struct GemmConfigTwoStage_Wmma : public GemmConfigComputeV3_WMMA<PrecType_>
+{
+    using WorkspaceType = ck_tile::remove_cvref_t<WorkspaceType_>;
+};
+
+struct SplitKTwoStageInvoker
+{
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
+    {
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                               GemmConfig::kPadN,
+                                               GemmConfig::kPadK,
+                                               ALayout,
+                                               BLayout,
+                                               ELayout,
+                                               GemmConfig::NumWaveGroups>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             Persistent,
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
+
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+            using WorkspaceType = ck_tile::remove_cvref_t<typename GemmConfig::WorkspaceType>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 WorkspaceType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 CDEElementWise,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 GemmConfig::M_Warp,
+                                                 GemmConfig::N_Warp,
+                                                 GemmConfig::M_Warp_Tile,
+                                                 GemmConfig::N_Warp_Tile,
+                                                 GemmConfig::K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation,
+                                                 GemmConfig::NumWaveGroups>>;
+
+            using GemmKernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+
+            ck_tile::DeviceMem ws_m_n_dev_buf(args.M * args.N * sizeof(WorkspaceType));
+            ck_tile::GemmHostArgs ws_args = ck_tile::GemmHostArgs(args);
+            auto c_ptr                    = ws_args.c_ptr;
+            ws_args.c_ptr                 = ws_m_n_dev_buf.GetDeviceBuffer();
+            auto gemm_kargs               = GemmKernel::MakeKernelArgs(ws_args);
+
+            const dim3 grids  = Persistent ? GemmKernel::MaxOccupancyGridSize(s)
+                                           : GemmKernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = GemmKernel::BlockSize();
+
+            if(!GemmKernel::IsSupportedArgument(gemm_kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            using XElementwiseOperation = ck_tile::element_wise::UnaryConvert;
+            using BlockTile             = ck_tile::sequence<2048>;
+            using BlockWarps            = ck_tile::sequence<8>;
+            using WarpTile              = ck_tile::sequence<64>;
+
+            using ElementwiseShape =
+                ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, WorkspaceType>;
+            using Problem = ck_tile::ElementWisePipelineProblem<WorkspaceType,
+                                                                WorkspaceType,
+                                                                CDataType,
+                                                                ElementwiseShape,
+                                                                XElementwiseOperation>;
+            using ElementwiseKernel =
+                ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+            ck_tile::index_t total_elements     = 1;
+            std::vector<ck_tile::index_t> shape = {args.M, args.N};
+
+            for(auto d : shape)
+                total_elements *= d;
+
+            const ck_tile::index_t kBlockSize      = ElementwiseKernel::BlockSize();
+            constexpr ck_tile::index_t kBlockPerCu = 1;
+
+            constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+            ck_tile::index_t kGridSize =
+                (total_elements + elements_per_block - 1) / elements_per_block;
+
+            auto input_tensors = ck_tile::make_tuple(static_cast<WorkspaceType*>(ws_args.c_ptr));
+            auto input_size    = ck_tile::make_tuple(args.M, args.N);
+
+            // Check if the kernel configuration is supported
+            if(!ElementwiseKernel::IsSupportedArgument(input_size))
+            {
+                throw std::runtime_error(
+                    "Wrong! Elementwise arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << GemmKernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            // Declare rotating_mem_ptr here so it stays in scope until it is needed
+            std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+            std::function<void()> preprocess;
+
+            auto clear_gemm_output = [&]() {
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        ws_args.c_ptr, 0, args.M * args.N * sizeof(WorkspaceType), s.stream_id_));
+            };
+
+            if(s.flush_cache_)
+            {
+                std::cout << "Flushing cache..." << std::endl;
+
+                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+                auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+                auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+                rotating_mem_ptr =
+                    std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                        gemm_kargs.as_ptr[0],
+                        gemm_kargs.bs_ptr[0],
+                        s.rotating_count_,
+                        size_a_buffer,
+                        size_b_buffer);
+                rotating_mem_ptr->Print();
+
+                preprocess = [&]() {
+                    ck_tile::flush_icache();
+                    rotating_mem_ptr->Next();
+                    clear_gemm_output();
+                };
+            }
+            else
+            {
+                preprocess = clear_gemm_output;
+            }
+
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                preprocess,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                    GemmKernel{}, grids, blocks, 0, gemm_kargs),
+                ck_tile::make_kernel<kBlockPerCu>(ElementwiseKernel{},
+                                                  kGridSize,
+                                                  kBlockSize,
+                                                  0,
+                                                  input_size,
+                                                  ck_tile::make_tuple(args.N, 1), // Input Stride
+                                                  ck_tile::make_tuple(args.N, 1), // Output Stride
+                                                  input_tensors,
+                                                  static_cast<CDataType*>(c_ptr)));
+
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpSet{});
+            }
+            else
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        return ave_time;
+    }
+};
diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
index f42135a0b5..f200332588 100644
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
@@ -343,7 +343,6 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
     using WarpTile   = ck_tile::sequence<32, 128>;
     using ThreadTile = ck_tile::sequence<8, 8>;
 
-    constexpr ck_tile::index_t kBlockSize  = 256;
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     ck_tile::index_t kGridSize = (output_size + BlockTile::at(ck_tile::number<0>{}) - 1) /
@@ -352,7 +351,8 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
     using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
     using Problem =
         ck_tile::Reduce2dProblem<CDataType, ComputeDataType, CDataType, Shape, ReduceOp>;
-    using Kernel = ck_tile::Reduce<Problem>;
+    using Kernel                      = ck_tile::Reduce<Problem>;
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
 
     if(!Kernel::IsSupportedArgument(reduce_dim_size, workspace_strides))
     {
@@ -608,16 +608,11 @@ template <typename GemmConfig,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-int run_gemm_example_with_layouts_two_stage(int argc,
-                                            char* argv[],
+int run_gemm_example_with_layouts_two_stage(ck_tile::ArgParser& arg_parser,
                                             const ALayout a_layout                  = ALayout{},
                                             const BLayout b_layout                  = BLayout{},
                                             [[maybe_unused]] const CLayout c_layout = CLayout{})
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
 
     ck_tile::index_t M = arg_parser.get_int("m");
@@ -837,12 +832,13 @@ template <typename GemmConfig,
           typename APrecType,
           typename BPrecType = APrecType,
           typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
-    using Row                 = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col                 = ck_tile::tensor_layout::gemm::ColumnMajor;
-    auto [result, arg_parser] = create_args(argc, argv);
-    bool preshuffle           = GemmConfig::Preshuffle;
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
 
     if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
     {
@@ -866,7 +862,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            CPrecType,
                                                            Row,
                                                            Col,
-                                                           Row>(argc, argv, Row{}, Col{}, Row{});
+                                                           Row>(arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
@@ -876,7 +872,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            CPrecType,
                                                            Col,
                                                            Col,
-                                                           Row>(argc, argv, Col{}, Col{}, Row{});
+                                                           Row>(arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -892,7 +888,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            APrecType,
                                                            BPrecType,
                                                            CPrecType>(
-                argc, argv, Row{}, Row{}, Row{});
+                arg_parser, Row{}, Row{}, Row{});
         }
         if(a_layout == "R" && b_layout == "C")
         {
@@ -900,7 +896,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            APrecType,
                                                            BPrecType,
                                                            CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+                arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "R")
         {
@@ -908,7 +904,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            APrecType,
                                                            BPrecType,
                                                            CPrecType>(
-                argc, argv, Col{}, Row{}, Row{});
+                arg_parser, Col{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
@@ -916,7 +912,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
                                                            APrecType,
                                                            BPrecType,
                                                            CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+                arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -927,12 +923,8 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
 }
 
 template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
@@ -940,33 +932,33 @@ int run_gemm_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
                                           ck_tile::fp8_t,
                                           ck_tile::fp8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
                                           ck_tile::bf8_t,
                                           ck_tile::bf8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "int8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
                                           ck_tile::int8_t,
                                           ck_tile::int8_t,
-                                          ck_tile::int32_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::int32_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "pk_int4_t")
     {
@@ -976,7 +968,7 @@ int run_gemm_example(int argc, char* argv[])
             return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
                                               ck_tile::half_t,
                                               ck_tile::pk_int4_t,
-                                              ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
         }
         else
         {
@@ -992,9 +984,19 @@ int run_gemm_example(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
+    auto arg_parser = create_args();
+    auto result     = arg_parser.parse(argc, argv);
+
+    if(!result)
+        return -1;
+
     try
     {
-        return !run_gemm_example<GemmConfigComputeV3>(argc, argv);
+#if CK_TILE_USE_WMMA
+        return !run_gemm_example<GemmConfigComputeV3_WMMA>(arg_parser);
+#else
+        return !run_gemm_example<GemmConfigComputeV3>(arg_parser);
+#endif
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp
index 10e2b9abc1..001d1b6a00 100644
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -4,11 +4,13 @@
 #pragma once
 
 #include <string>
+#include <variant>
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
@@ -70,6 +72,7 @@ struct GemmConfigBase
     static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
     static constexpr ck_tile::index_t NumWaveGroups = 1;
     static constexpr bool Preshuffle                = false;
+    static constexpr bool TiledMMAPermuteN          = false;
 
     static constexpr bool SkipALds = false;
     static constexpr bool SkipBLds = false;
@@ -178,7 +181,6 @@ struct GemmConfigComputeV3_2 : public GemmConfigBase
     static constexpr int kBlockPerCu = 2;
 };
 
-#if CK_TILE_USE_WMMA
 template <typename PrecType>
 struct GemmConfigComputeV3_WMMA : public GemmConfigBase
 {
@@ -199,7 +201,6 @@ struct GemmConfigComputeV3_WMMA : public GemmConfigBase
 
     static constexpr int kBlockPerCu = 2;
 };
-#endif
 
 template <typename PrecType>
 struct GemmConfigComputeV4 : public GemmConfigBase
@@ -281,6 +282,8 @@ struct GemmConfigPreshuffleDecode : public GemmConfigBase
     static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
     static constexpr bool Preshuffle           = true;
     static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr int N_Repeat              = N_Tile / N_Warp_Tile / N_Warp;
+    static constexpr bool TiledMMAPermuteN     = N_Repeat % 2 == 0;
 };
 
 template <typename PrecType>
@@ -303,6 +306,16 @@ struct GemmConfigPreshufflePrefill : public GemmConfigBase
     static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
     static constexpr bool Preshuffle           = true;
     static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr int N_Repeat              = N_Tile / N_Warp_Tile / N_Warp;
+    static constexpr bool TiledMMAPermuteN     = N_Repeat % 2 == 0;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshufflePrefill_Wmma : public GemmConfigPreshufflePrefill<PrecType>
+{
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
 };
 
 template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
@@ -345,6 +358,24 @@ struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
     using CDataType   = ck_tile::half_t;
 };
 
+template <>
+struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <>
+struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::bf8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
 template <>
 struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
 {
@@ -479,7 +510,7 @@ struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V2>
         ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
 };
 
-auto create_args(int argc, char* argv[])
+auto create_args()
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3840", "m dimension")
@@ -499,11 +530,11 @@ auto create_args(int argc, char* argv[])
         .insert("split_k", "1", "splitK value")
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
         .insert("persistent", "0", "0:non-persistent, 1:persistent")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "gemm.json", "json file name to dump results")
         .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
         .insert("rotating_count", "1000", "rotating count, defaults to 1000");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
+    return arg_parser;
 }
 
 // Type aliases for memory operation integral constants
diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
index 0018db2c99..0f323cb0e3 100644
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
@@ -12,196 +12,7 @@
 #include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"
 #include "run_gemm_example.inc"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename ELayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using TilePartitioner =
-        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                   GemmConfig::TileParitionerGroupNum,
-                                                   GemmConfig::TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
-                                           GemmConfig::kPadN,
-                                           GemmConfig::kPadK,
-                                           ALayout,
-                                           BLayout,
-                                           ELayout,
-                                           GemmConfig::NumWaveGroups>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 ELayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity,
-                                                                 Persistent,
-                                                                 GemmConfig::NumWaveGroups,
-                                                                 GemmConfig::Preshuffle>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    using BaseGemmPipeline = typename PipelineTypeTraits<
-        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    float ave_time{0};
-
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = GemmConfig::Scheduler;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
-
-        using GemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
-
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             CDataType,
-                                             DsLayout,
-                                             ELayout,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation,
-                                             GemmConfig::NumWaveGroups>>;
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        dim3 grids;
-        if constexpr(Persistent)
-        {
-            grids = Kernel::MaxOccupancyGridSize(s);
-        }
-        else
-        {
-            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-        }
-        dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << GemmShape::GetName() << '\n'
-                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                      << "pipeline: " << GemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << ", kBlockPerCu: {" << GemmConfig::kBlockPerCu << "}" << std::endl;
-        }
-        if(s.flush_cache_)
-        {
-            std::cout << "Flushing cache..." << std::endl;
-
-            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
-            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
-
-            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck_tile::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-                // clear c mem
-                if(args.k_batch > 1)
-                    hipGetErrorString(hipMemsetAsync(
-                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-            };
-            ave_time = ck_tile::launch_kernel_time_mask(
-                s,
-                run_flush_cache,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        else
-        {
-            ave_time = ck_tile::launch_kernel(
-                s,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        return ave_time;
-    };
-
-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(args.k_batch == 1)
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
-        }
-        else
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
-    return ave_time;
-}
+#include "gemm_weight_preshuffle_invoker.hpp"
 
 template <typename GemmConfig,
           typename APrecType,
@@ -214,6 +25,7 @@ int run_gemm_example_prec_type(std::string a_layout,
     using Row       = ck_tile::tensor_layout::gemm::RowMajor;
     using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
     bool preshuffle = GemmConfig::Preshuffle;
+    using Invoker   = WeightPreshuffleInvoker;
 
     if(preshuffle && (a_layout != "R" || b_layout != "C"))
     {
@@ -223,7 +35,7 @@ int run_gemm_example_prec_type(std::string a_layout,
 
     if(a_layout == "R" && b_layout == "C")
     {
-        return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+        return run_gemm_example_with_layouts<GemmConfig, Invoker, APrecType, BPrecType, CPrecType>(
             arg_parser, Row{}, Col{}, Row{});
     }
     else
@@ -271,13 +83,19 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    auto arg_parser = create_args();
+    auto result     = arg_parser.parse(argc, argv);
+
     if(!result)
         return -1;
 
     try
     {
-        return !run_gemm_example<GemmConfigPreshuffleDecode>(arg_parser);
+#if CK_TILE_USE_WMMA
+        return !run_gemm_example<GemmConfigPreshufflePrefill_Wmma>(arg_parser);
+#else
+        return !run_gemm_example<GemmConfigPreshufflePrefill>(arg_parser);
+#endif
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle_invoker.hpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle_invoker.hpp
new file mode 100644
index 0000000000..d737a0f864
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle_invoker.hpp
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "gemm_utils.hpp"
+
+struct WeightPreshuffleInvoker
+{
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
+    {
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                               GemmConfig::kPadN,
+                                               GemmConfig::kPadK,
+                                               ALayout,
+                                               BLayout,
+                                               ELayout,
+                                               GemmConfig::NumWaveGroups>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             Persistent,
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
+
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 CDEElementWise,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 GemmConfig::M_Warp,
+                                                 GemmConfig::N_Warp,
+                                                 GemmConfig::M_Warp_Tile,
+                                                 GemmConfig::N_Warp_Tile,
+                                                 GemmConfig::K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation,
+                                                 GemmConfig::NumWaveGroups,
+                                                 false,
+                                                 1,
+                                                 GemmConfig::TiledMMAPermuteN>>;
+            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            dim3 grids;
+            if constexpr(Persistent)
+            {
+                grids = Kernel::MaxOccupancyGridSize(s);
+            }
+            else
+            {
+                grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+            }
+            dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << ", kBlockPerCu: {" << GemmConfig::kBlockPerCu << "}"
+                          << std::endl;
+            }
+            if(s.flush_cache_)
+            {
+                std::cout << "Flushing cache..." << std::endl;
+
+                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+                auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+                auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+                ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(kargs.as_ptr[0],
+                                                                               kargs.bs_ptr[0],
+                                                                               s.rotating_count_,
+                                                                               size_a_buffer,
+                                                                               size_b_buffer);
+                rotating_mem.Print();
+
+                auto run_flush_cache = [&]() {
+                    // flush icache
+                    ck_tile::flush_icache();
+                    // rotating mem
+                    rotating_mem.Next();
+                    // clear c mem
+                    if(args.k_batch > 1)
+                        hipGetErrorString(hipMemsetAsync(
+                            args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+                };
+                ave_time =
+                    ck_tile::launch_kernel_time_mask(s,
+                                                     run_flush_cache,
+                                                     ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                                         Kernel{}, grids, blocks, 0, kargs));
+            }
+            else
+            {
+                ave_time = ck_tile::launch_kernel(s,
+                                                  ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                                      Kernel{}, grids, blocks, 0, kargs));
+            }
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::set>{});
+            }
+            else
+            {
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::atomic_add>{});
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        return ave_time;
+    }
+};
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 7a9bfc9e13..5c8b16dcba 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
+#include "ck_tile/host/permute_pk_int4.hpp"
 
 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
@@ -93,76 +94,8 @@ void permute_tensor_b(Tensor& tensor)
     }
 }
 
-template <typename Tensor>
-void permute_vectors_i4x4_b(Tensor& tensor)
-{
-    const ck_tile::index_t K = tensor.get_length(0);
-    const ck_tile::index_t N = tensor.get_length(1);
-    // vector pk_i4x4 permute
-    for(int i = 0; i < N; i++)
-    {
-        for(int j = 0; j < K; j += 8)
-        {
-            int8_t input[8];
-
-            for(int k = 0; k < 4; k++)
-            {
-                int8_t i4x2      = tensor(j + k * 2, i).data;
-                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
-                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
-            }
-
-            // permute 01234567->20643175
-            {
-                int8_t hi   = input[2];
-                int8_t lo   = input[0];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 0, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[6];
-                int8_t lo   = input[4];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 2, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[3];
-                int8_t lo   = input[1];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 4, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[7];
-                int8_t lo   = input[5];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 6, i) = i4x2;
-            }
-        }
-    }
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
-
 template <typename GemmConfig,
+          typename Invoker,
           typename ADataType,
           typename BDataType,
           typename DsDataType,
@@ -203,58 +136,41 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     float ave_time;
     if(persistent)
     {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        true,
-                        CDEElementWise>(
+        ave_time = Invoker::template gemm<GemmConfig,
+                                          ADataType,
+                                          BDataType,
+                                          DsDataType,
+                                          AccDataType,
+                                          CDataType,
+                                          ALayout,
+                                          BLayout,
+                                          DsLayout,
+                                          CLayout,
+                                          true,
+                                          CDEElementWise>(
             args,
             ck_tile::stream_config{
                 nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
     }
     else
     {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        false,
-                        CDEElementWise>(
+        ave_time = Invoker::template gemm<GemmConfig,
+                                          ADataType,
+                                          BDataType,
+                                          DsDataType,
+                                          AccDataType,
+                                          CDataType,
+                                          ALayout,
+                                          BLayout,
+                                          DsLayout,
+                                          CLayout,
+                                          false,
+                                          CDEElementWise>(
             args,
             ck_tile::stream_config{
                 nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
     }
 
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with \n M=" << M << " N=" << N << " K=" << K
-              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
-              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
-              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
-              << " B_Type=" << DataTypeTraits<BDataType>::name
-              << " C_Type=" << DataTypeTraits<CDataType>::name
-              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
-              << " Persistent=" << (persistent ? "on" : "off") << " : \n"
-              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << std::endl;
-
     return ave_time;
 }
 
@@ -262,16 +178,63 @@ template <typename GemmConfig, typename T>
 auto shuffle_b(const ck_tile::HostTensor<T>& t)
 {
     assert(t.get_lengths().size() == 2);
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[0];
+
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+}
+
+template <typename GemmConfig, typename T>
+auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
+{
+    assert(t.get_lengths().size() == 2);
+
     int n_                = t.get_lengths()[1];
     int k_                = t.get_lengths()[0];
     constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
-    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+    constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;
+    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
+                                   GemmConfig::N_Warp,
                                    GemmConfig::N_Warp_Tile,
+                                   NRepeat,
                                    k_ / GemmConfig::K_Warp_Tile,
                                    divisor,
                                    GemmConfig::K_Warp_Tile / divisor});
     std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
 }
 
 template <typename CDataType>
@@ -294,6 +257,7 @@ bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
 }
 
 template <typename GemmConfig,
+          typename Invoker,
           typename ADataType,
           typename BDataType = ADataType,
           typename CDataType = ADataType,
@@ -378,7 +342,18 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
 
     if constexpr(preshuffle)
     {
-        ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
+        ck_tile::HostTensor<BDataType> b_shuffle_host = [&]() {
+            if constexpr(GemmConfig::TiledMMAPermuteN)
+            {
+                std::cout << "Run with PermuteN" << std::endl;
+                return shuffle_b_permuteN<GemmConfig>(b_k_n);
+            }
+            else
+            {
+                std::cout << "Run without PermuteN" << std::endl;
+                return shuffle_b<GemmConfig>(b_k_n);
+            }
+        }();
         // shuffled buffer B for device implementation
         b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
     }
@@ -400,7 +375,7 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
                                  BLayout,
                                  CLayout>(b_k_n_dev);
             }
-            permute_vectors_i4x4_b(b_k_n_dev);
+            ck_tile::permute_vectors_i4x4_b(b_k_n_dev);
             b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
         }
         else
@@ -418,32 +393,50 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
     c_m_n_dev_buf.SetZero();
     c_m_n_dev_result.SetZero();
 
-    invoke_gemm<GemmConfig,
-                ADataType,
-                BDataType,
-                ck_tile::tuple<>,
-                AccDataType,
-                CDataType,
-                ALayout,
-                BLayout,
-                ck_tile::tuple<>,
-                CLayout>(a_m_k_dev_buf,
-                         b_k_n_dev_buf,
-                         c_m_n_dev_buf,
-                         M,
-                         N,
-                         K,
-                         stride_A,
-                         stride_B,
-                         stride_C,
-                         kbatch,
-                         n_warmup,
-                         n_repeat,
-                         persistent,
-                         flush_cache,
-                         rotating_count);
+    float ave_time = invoke_gemm<GemmConfig,
+                                 Invoker,
+                                 ADataType,
+                                 BDataType,
+                                 ck_tile::tuple<>,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 ck_tile::tuple<>,
+                                 CLayout>(a_m_k_dev_buf,
+                                          b_k_n_dev_buf,
+                                          c_m_n_dev_buf,
+                                          M,
+                                          N,
+                                          K,
+                                          stride_A,
+                                          stride_B,
+                                          stride_C,
+                                          kbatch,
+                                          n_warmup,
+                                          n_repeat,
+                                          persistent,
+                                          flush_cache,
+                                          rotating_count);
 
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
+              << " B_Type=" << DataTypeTraits<BDataType>::name
+              << " C_Type=" << DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
+              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+
     bool pass = true;
 
     // memory on host to store gpu reference result
@@ -498,5 +491,28 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
         pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_gemm_json_results<ALayout,
+                               BLayout,
+                               CLayout,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               GemmConfig,
+                               DataTypeTraits>(arg_parser.get_str("jsonfile"),
+                                               M,
+                                               N,
+                                               K,
+                                               stride_A,
+                                               stride_B,
+                                               stride_C,
+                                               persistent,
+                                               pass,
+                                               ave_time,
+                                               tflops,
+                                               gb_per_sec);
+    }
+
     return pass;
 }
diff --git a/example/ck_tile/03_gemm/run_gemm_example_common.hpp b/example/ck_tile/03_gemm/run_gemm_example_common.hpp
new file mode 100644
index 0000000000..3c53f2b2e8
--- /dev/null
+++ b/example/ck_tile/03_gemm/run_gemm_example_common.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "gemm_utils.hpp"
+
+template <typename GemmConfig,
+          typename Invoker,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
+{
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    if(preshuffle && a_layout != "R" && b_layout != "C")
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    using LayoutVariant = std::variant<Row, Col>;
+
+    auto string_to_layout = [](const std::string& layout) -> LayoutVariant {
+        if(layout == "R")
+            return Row{};
+        if(layout == "C")
+            return Col{};
+        throw std::runtime_error("Unsupported layout: " + layout);
+    };
+
+    auto a_layout_variant = string_to_layout(a_layout);
+    auto b_layout_variant = string_to_layout(b_layout);
+
+    return std::visit(
+        [&](auto a_layout_type, auto b_layout_type) -> int {
+            if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t> &&
+                         std::is_same_v<decltype(b_layout_type), Row>)
+            {
+                throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                         "BPrecType is ck_tile::pk_int4_t!");
+            }
+            else
+            {
+                return run_gemm_example_with_layouts<GemmConfig,
+                                                     Invoker,
+                                                     APrecType,
+                                                     BPrecType,
+                                                     CPrecType>(
+                    arg_parser, a_layout_type, b_layout_type, Row{});
+            }
+        },
+        a_layout_variant,
+        b_layout_variant);
+}
diff --git a/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh b/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
index 951f8aa63a..c2ee7a1c3e 100755
--- a/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
+++ b/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
@@ -5,7 +5,7 @@ KNAME=1
 export CK_WARMUP=0
 export CK_REPEAT=1
 
-COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
 
 run_tests() {
     for m in 512 1024; do
@@ -32,5 +32,8 @@ run_tests "fp16"
 run_tests "bf16"
 run_tests "fp8"
 run_tests "bf8"
+run_tests "fp16i4"
+run_tests "fp8i4"
+run_tests "bf8i4"
 
 set +x
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index 8243d5c5c7..1bedd9b72d 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -5,292 +5,37 @@
 
 #include <cstring>
 #include <iostream>
-#include <sstream>
 #include <string>
-#include <tuple>
 
-#include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"
 #include "run_gemm_example.inc"
+#include "run_gemm_example_common.hpp"
+#include "universal_gemm_invoker.hpp"
 
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename ELayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using TilePartitioner =
-        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                   GemmConfig::TileParitionerGroupNum,
-                                                   GemmConfig::TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
-                                           GemmConfig::kPadN,
-                                           GemmConfig::kPadK,
-                                           ALayout,
-                                           BLayout,
-                                           ELayout,
-                                           GemmConfig::NumWaveGroups>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 ELayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity,
-                                                                 Persistent,
-                                                                 GemmConfig::NumWaveGroups,
-                                                                 GemmConfig::Preshuffle,
-                                                                 GemmConfig::SkipALds,
-                                                                 GemmConfig::SkipBLds>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    using BaseGemmPipeline = typename PipelineTypeTraits<
-        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    float ave_time{0};
-
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = GemmConfig::Scheduler;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
-
-        using GemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
-
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             CDataType,
-                                             DsLayout,
-                                             ELayout,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation,
-                                             GemmConfig::NumWaveGroups>>;
-
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        dim3 grids;
-        if constexpr(Persistent)
-        {
-            grids = Kernel::MaxOccupancyGridSize(s);
-        }
-        else
-        {
-            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-        }
-        dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << GemmShape::GetName() << '\n'
-                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                      << "pipeline: " << GemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-        if(s.flush_cache_)
-        {
-            std::cout << "Flushing cache..." << std::endl;
-
-            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
-            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
-
-            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck_tile::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-                // clear c mem
-                if(args.k_batch > 1)
-                    hipGetErrorString(hipMemsetAsync(
-                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-            };
-            ave_time = ck_tile::launch_kernel_time_mask(
-                s,
-                run_flush_cache,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        else
-        {
-            ave_time = ck_tile::launch_kernel(
-                s,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        return ave_time;
-    };
-
-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(args.k_batch == 1)
-        {
-            Run(has_hot_loop_, tail_number_, MemoryOpSet{});
-        }
-        else
-        {
-            Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename APrecType,
-          typename BPrecType = APrecType,
-          typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout,
-                               std::string b_layout,
-                               ck_tile::ArgParser& arg_parser)
-{
-    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
-    bool preshuffle = GemmConfig::Preshuffle;
-
-    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
-    {
-        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
-    }
-
-    if(preshuffle && a_layout != "R" && b_layout != "C")
-    {
-        throw std::runtime_error(
-            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
-    }
-
-    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Col{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices when "
-                                     "BPrecType is ck_tile::pk_int4_t!");
-        }
-    }
-    else
-    {
-        if(a_layout == "R" && b_layout == "R")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Row{}, Row{});
-        }
-        else if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Row{}, Col{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "R")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Row{}, Row{});
-        }
-        else if(a_layout == "C" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                arg_parser, Col{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-}
-
-template <template <typename PreType> typename GemmConfig>
+template <template <typename PrecType> typename GemmConfig>
 int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
 
+    using Invoker = UniversalInvoker;
+
     if(data_type == "fp16")
     {
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, Invoker, ck_tile::half_t>(
             a_layout, b_layout, arg_parser);
     }
 
     else if(data_type == "bf16")
     {
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, Invoker, ck_tile::bf16_t>(
             a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          Invoker,
                                           ck_tile::fp8_t,
                                           ck_tile::fp8_t,
                                           ck_tile::half_t>(a_layout, b_layout, arg_parser);
@@ -298,6 +43,7 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
     else if(data_type == "bf8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          Invoker,
                                           ck_tile::bf8_t,
                                           ck_tile::bf8_t,
                                           ck_tile::half_t>(a_layout, b_layout, arg_parser);
@@ -305,16 +51,18 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
     else if(data_type == "int8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
+                                          Invoker,
                                           ck_tile::int8_t,
                                           ck_tile::int8_t,
                                           ck_tile::int32_t>(a_layout, b_layout, arg_parser);
     }
-    else if(data_type == "pk_int4_t")
+    else if(data_type == "fp16i4")
     {
         // TODO: Add support for bhalf_t ADataType
         if constexpr(GemmConfig<ck_tile::half_t>::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
         {
             return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
+                                              Invoker,
                                               ck_tile::half_t,
                                               ck_tile::pk_int4_t,
                                               ck_tile::half_t>(a_layout, b_layout, arg_parser);
@@ -324,6 +72,36 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
             throw std::runtime_error("Unsupported pipeline for this operation !!!");
         }
     }
+    else if(data_type == "fp8i4")
+    {
+        if constexpr(GemmConfig<ck_tile::fp8_t>::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              Invoker,
+                                              ck_tile::fp8_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    else if(data_type == "bf8i4")
+    {
+        if constexpr(GemmConfig<ck_tile::bf8_t>::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              Invoker,
+                                              ck_tile::bf8_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
     else
     {
         throw std::runtime_error("Unsupported data type for this operation !!!");
@@ -332,7 +110,9 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    auto arg_parser = create_args();
+    auto result     = arg_parser.parse(argc, argv);
+
     if(!result)
         return -1;
 
diff --git a/example/ck_tile/03_gemm/universal_gemm_invoker.hpp b/example/ck_tile/03_gemm/universal_gemm_invoker.hpp
new file mode 100644
index 0000000000..733b15ae20
--- /dev/null
+++ b/example/ck_tile/03_gemm/universal_gemm_invoker.hpp
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <functional>
+#include "gemm_utils.hpp"
+
+struct UniversalInvoker
+{
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
+    {
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                               GemmConfig::kPadN,
+                                               GemmConfig::kPadK,
+                                               ALayout,
+                                               BLayout,
+                                               ELayout,
+                                               GemmConfig::NumWaveGroups>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             Persistent,
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle,
+                                             GemmConfig::SkipALds,
+                                             GemmConfig::SkipBLds>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
+
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 CDEElementWise,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 GemmConfig::M_Warp,
+                                                 GemmConfig::N_Warp,
+                                                 GemmConfig::M_Warp_Tile,
+                                                 GemmConfig::N_Warp_Tile,
+                                                 GemmConfig::K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation,
+                                                 GemmConfig::NumWaveGroups>>;
+
+            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Persistent ? Kernel::MaxOccupancyGridSize(s)
+                                           : Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            // Declare rotating_mem_ptr here so it stays in scope until it is needed
+            std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+            std::function<void()> preprocess;
+
+            auto clear_gemm_output = [&]() {
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+
+            if(s.flush_cache_)
+            {
+                std::cout << "Flushing cache..." << std::endl;
+
+                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+                auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+                auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+                rotating_mem_ptr =
+                    std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                        kargs.as_ptr[0],
+                        kargs.bs_ptr[0],
+                        s.rotating_count_,
+                        size_a_buffer,
+                        size_b_buffer);
+                rotating_mem_ptr->Print();
+
+                preprocess = [&]() {
+                    ck_tile::flush_icache();
+                    rotating_mem_ptr->Next();
+                    clear_gemm_output();
+                };
+            }
+            else
+            {
+                preprocess = clear_gemm_output;
+            }
+
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                preprocess,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpSet{});
+            }
+            else
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        return ave_time;
+    }
+};
diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
index a110c2f98d..297ff03992 100644
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -3,8 +3,24 @@
 
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 #include <cstring>
 
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
@@ -14,8 +30,10 @@ auto create_args(int argc, char* argv[])
         .insert("c", "512", "c dimension")
         .insert("v", "1", "cpu validation or not")
         .insert("prec", "fp16", "precision")
-        .insert("warmup", "0", "cold iter")
-        .insert("repeat", "1", "hot iter");
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "reduce.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -70,7 +88,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // using WarpTile   = ck_tile::sequence<1, 512>;
     // using Vector = ck_tile::sequence<1, 8>;
 
-    constexpr ck_tile::index_t kBlockSize  = 256;
     constexpr ck_tile::index_t kBlockPerCu = 1;
     ck_tile::index_t kept_dim_len_prod     = N * C;
     ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
@@ -81,8 +98,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using Porblem =
         ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
 
-    using Kernel = ck_tile::Reduce<Porblem>;
-
+    using Kernel                      = ck_tile::Reduce<Porblem>;
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
     // Create input tensor shape and strides
     auto input_shape =
         ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
@@ -126,6 +143,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_reduce_json_results<DataType, DataTypeTraits>(
+            arg_parser.get_str("jsonfile"), N, C, H, W, pass, ave_time, 0, gb_per_sec);
+    }
+
     return pass;
 }
 
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
index d486196fc3..aa9fd97171 100644
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -88,10 +88,9 @@ struct matrix_core_swizzle_kernel
     using karg = matrix_core_swizzle_host_args;
     using harg = matrix_core_swizzle_host_args;
 
-    static constexpr int BLOCK_SIZE      = BLOCK_SIZE_;
-    static constexpr int WavesPerBlock_N = 4;
-    static constexpr int WavesPerBlock_K = 1;
-    static_assert(WavesPerBlock_N * WavesPerBlock_K * 64 == BLOCK_SIZE);
+    static constexpr int BLOCK_SIZE                   = BLOCK_SIZE_;
+    static constexpr int WavesPerBlock_N              = BLOCK_SIZE / ck_tile::get_warp_size();
+    static constexpr int WavesPerBlock_K              = 1;
     static constexpr int NPerBlock                    = NPerBlock_;
     static constexpr int KPerBlock                    = KPerBlock_;
     static constexpr matrix_core_permute_style pstyle = pstyle_;
diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp
index aafece0f25..c4c6f077d7 100644
--- a/example/ck_tile/06_permute/permute.cpp
+++ b/example/ck_tile/06_permute/permute.cpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "permute.hpp"
 #include "ck_tile/host.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #include <array>
 #include <cstring>
@@ -127,7 +128,9 @@ auto create_args(int argc, char* argv[])
                 "random seed used for initializing input tensors. 0 for "
                 "non-deterministic seed")
         .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "permute.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -256,6 +259,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
         return permute(t, a, stream_config);
     };
+#if !CK_TILE_USE_WMMA
 #ifdef PERMUTE_USE_ALTERNATIVE_IMPL
     // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
     if((arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") ||
@@ -344,6 +348,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }
     }
     else
+#endif
 #endif
     {
         ave_time = run_permute();
@@ -382,6 +387,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_permute_json_results(arg_parser.get_str("jsonfile"), data_type, pass, ave_time, 0, 0);
+    }
+
     std::cout << std::endl;
 
     return pass;
diff --git a/example/ck_tile/09_topk_softmax/README.md b/example/ck_tile/09_topk_softmax/README.md
index 2e15aeaae5..8bed733d36 100644
--- a/example/ck_tile/09_topk_softmax/README.md
+++ b/example/ck_tile/09_topk_softmax/README.md
@@ -24,5 +24,7 @@ args:
        -st_o    row stride of output/indices, -1 means same as topk (default:-1)
        -seed    seed to be used, -1 means random every time (default:-1)
       -kname    when set to 1 it will print kernel name (default:0)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:topk_softmax.json)
 
 ```
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax.cpp b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
index 6fc25631fd..0487bd05d2 100644
--- a/example/ck_tile/09_topk_softmax/topk_softmax.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 #include <iostream>
@@ -13,6 +13,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/reduce.hpp"
 #include "topk_softmax_api.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #if 0
 template <typename T>
@@ -130,7 +131,9 @@ auto create_args(int argc, char* argv[])
         .insert("seed", "-1", "seed to be used, -1 means random every time")
         .insert("kname", "0", "when set to 1 it will print kernel name")
         .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "topk_softmax.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -273,6 +276,23 @@ bool test_topk_softmax(ck_tile::ArgParser args)
     }
 
     printf("valid:%s\n", rtn ? "y" : "n");
+
+    if(args.get_int("json") == 1)
+    {
+        dump_topk_softmax_json(args.get_str("jsonfile"),
+                               input_prec,
+                               weight_prec,
+                               tokens,
+                               experts,
+                               topk,
+                               stride_input,
+                               stride_output,
+                               ms,
+                               0,
+                               0,
+                               rtn);
+    }
+
     fflush(stdout);
     return rtn;
 }
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
index c2bad24cfe..6e6bb20020 100644
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "topk_softmax_api.hpp"
 
diff --git a/example/ck_tile/10_rmsnorm2d/README.md b/example/ck_tile/10_rmsnorm2d/README.md
index 1d27ad153e..4f2bc8b5ad 100644
--- a/example/ck_tile/10_rmsnorm2d/README.md
+++ b/example/ck_tile/10_rmsnorm2d/README.md
@@ -6,17 +6,34 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_rmsnorm2d_fwd -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_rmsnorm2d_fwd -j`nproc`
 ```
 This will result in an executable `build/bin/tile_rmsnorm2d_fwd`
 
 ## cmdline
 ```
 args:
-          -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
-          -e    epsilon (default:1e-5)
-          -v    cpu validation or not (default:1)
-       -prec    precision (default:fp16)
+           -m    m dimension (default:3328)
+           -n    n dimension (default:4096)
+    -x_stride    x row_stride, if -1 then equal to n (default:-1)
+   -xr_stride    x residule row_stride, if -1 then equal to n (default:-1)
+    -y_stride    y row_stride, if -1 then equal to n (default:-1)
+   -yr_stride    y residule row_stride, if -1 then equal to n (default:-1)
+           -e    epsilon (default:1e-5)
+    -save_rms    save rms(invrms) or not. set to 1 in training case (default:0)
+-save_unquant    save result before quant (default:0)
+           -v    cpu validation or not (default:1)
+       -kname    print kernel name or not (default:1)
+      -prec_i    input precision (default:fp16)
+      -prec_o    output precision, set auto will be the same as input (default:auto)
+     -prec_sm    output quant scale type, set auto will use fp32. used when fquant=1 (default:auto)
+     -prec_sy    output quant scale type, set auto will use fp32. used when fquant=1 or 2 (default:auto)
+        -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
+      -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+      -warmup    cold iter (default:5)
+      -repeat    hot iter (default:20)
+           -s    sensitive model mode, 0: for no specific model, 1: for T5-like model (default:0)
+        -json    0: No Json, 1: Dump Results in Json format (default:0)
+    -jsonfile    json file name to dump results (default:rmsnorm2d_fwd.json)
 ```
diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
index 511efeeaec..2ca5157eda 100644
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -71,11 +71,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     constexpr bool kTwoPass = true;
 
-    using BlockWarps = ck_tile::sequence<2, 2>;
-    using BlockTile  = ck_tile::sequence<2, 128>;
-    using WarpTile   = ck_tile::sequence<1, 64>;
-    using Vector     = ck_tile::sequence<1, 1>;
-    using Shape      = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using BlockTile      = ck_tile::sequence<2, 128>;
+    using Vector         = ck_tile::sequence<1, 1>;
+    using ThreadPerBlock = ck_tile::sequence<2, 128>;
+
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     using PipelineTraits =
         ck_tile::Rmsnorm2dFwdTraits<true,  // kPadN
diff --git a/example/ck_tile/10_rmsnorm2d/generate.py b/example/ck_tile/10_rmsnorm2d/generate.py
index ea8dfdf9ce..0e948322a2 100644
--- a/example/ck_tile/10_rmsnorm2d/generate.py
+++ b/example/ck_tile/10_rmsnorm2d/generate.py
@@ -75,54 +75,17 @@ struct rmsnorm2d_fwd_traits_
     using YScaleDataType      = ck_tile::remove_cvref_t<YScaleDataType_>;
     using UnquantYDataType    = ck_tile::remove_cvref_t<UnquantYDataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
     using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
+    
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN        = kPadN_;
     static constexpr bool kSaveInvRms  = kSaveInvRms_;
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
index 751b868411..6e2664e9ba 100644
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -1,6 +1,7 @@
 #include "ck_tile/host.hpp"
 #include "rmsnorm2d_fwd.hpp"
 #include <cstring>
+#include "ck_tile/utility/json_dump.hpp"
 
 // different threshold for different dtype
 template <typename DataType>
@@ -53,7 +54,9 @@ auto create_args(int argc, char* argv[])
         .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
         .insert("warmup", "5", "cold iter")
         .insert("repeat", "20", "hot iter")
-        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model");
+        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "rmsnorm2d_fwd.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -437,6 +440,23 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_rmsnorm2d_fwd_json(arg_parser.get_str("jsonfile"),
+                                prec_str,
+                                m,
+                                n,
+                                x_stride,
+                                xr_stride,
+                                y_stride,
+                                yr_stride,
+                                use_model_sensitive_rmsnorm,
+                                ave_time,
+                                0,
+                                gb_per_sec,
+                                pass);
+    }
+
     return pass;
 }
 
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
index f9ba76c9e3..6c01655b75 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
@@ -6,8 +6,8 @@ This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_add_rmsnorm2d_rdquant_fwd -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_add_rmsnorm2d_rdquant_fwd -j`nproc`
 ```
 This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`
 
@@ -15,8 +15,16 @@ This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`
 ```
 args:
           -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
+          -n    n dimension (default:4096)
+     -stride    stride per row, if -1 then equal to n (default:-1)
           -e    epsilon (default:1e-5)
+     -save_x    save rms(invrms) or not. set to 1 in training case (default:1)
           -v    cpu validation or not (default:1)
+      -kname    print kernel name or not (default:1)
        -prec    precision (default:fp16)
+      -quant    precision (default:int8)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:add_rmsnorm2d_rdquant_fwd.json)
 ```
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
index 1cd375d0f5..ae14be026b 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
@@ -1,6 +1,7 @@
 #include "ck_tile/host.hpp"
 #include "add_rmsnorm2d_rdquant_fwd.hpp"
 #include <cstring>
+#include "ck_tile/utility/json_dump.hpp"
 
 // different threshold for different dtype
 template <typename InputDataType>
@@ -41,7 +42,9 @@ auto create_args(int argc, char* argv[])
         .insert("prec", "fp16", "precision")
         .insert("quant", "int8", "precision")
         .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "add_rmsnorm2d_rdquant_fwd.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -260,6 +263,21 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_add_rmsnorm2d_rdquant_fwd_json(arg_parser.get_str("jsonfile"),
+                                            input_data_type,
+                                            quantized_data_type,
+                                            m,
+                                            n,
+                                            stride,
+                                            epsilon,
+                                            ave_time,
+                                            0,
+                                            gb_per_sec,
+                                            pass);
+    }
+
     return pass;
 }
 
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
index faa134e5c4..b7bd7ac7df 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -80,55 +80,17 @@ struct add_rmsnorm2d_rdquant_fwd_traits_
     using InputDataType     = ck_tile::remove_cvref_t<InputDataType_>;
     using QuantizedDataType = ck_tile::remove_cvref_t<QuantizedDataType_>;
 
-    static constexpr auto WarpSize        = ck_tile::get_warp_size();
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
 
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN      = kPadN_;
     static constexpr bool kSaveX     = kSaveX_;
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
index ace5fe0c4f..ca94bc1b71 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -99,12 +99,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     constexpr bool kThreePass = true;
 
-    using BlockWarps = ck_tile::sequence<4, 1>;
-    using BlockTile  = ck_tile::sequence<4, 128>;
-    using WarpTile   = ck_tile::sequence<1, 64>;
-    using Vector     = ck_tile::sequence<1, 1>;
+    using BlockTile      = ck_tile::sequence<4, 128>;
+    using Vector         = ck_tile::sequence<1, 1>;
+    using ThreadPerBlock = ck_tile::sequence<4, 64>;
 
-    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
     using Problem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem<ADataType,
                                                                    BDataType,
                                                                    GammaDataType,
diff --git a/example/ck_tile/12_smoothquant/README.md b/example/ck_tile/12_smoothquant/README.md
index 6b3acd558b..98205e7350 100644
--- a/example/ck_tile/12_smoothquant/README.md
+++ b/example/ck_tile/12_smoothquant/README.md
@@ -6,8 +6,8 @@ This folder contains example for smoothquant using ck_tile tile-programming impl
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_smoothquant -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_smoothquant -j`nproc`
 ```
 This will result in an executable `build/bin/tile_smoothquant`
 
@@ -15,7 +15,14 @@ This will result in an executable `build/bin/tile_smoothquant`
 ```
 args:
           -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
+          -n    n dimension (default:4096)
+   -x_stride    input stride per row, if -1 then equal to n (default:-1)
+   -y_stride    output stride per row, if -1 then equal to n (default:-1)
           -v    cpu validation or not (default:1)
+      -kname    print kernel name or not (default:1)
        -prec    precision (default:fp16)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:smoothquant.json)
 ```
diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
index e688947d71..fe9f585ed7 100644
--- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -94,12 +94,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     constexpr bool kTwoPass = true;
 
-    using BlockWarps = ck_tile::sequence<2, 2>;
-    using BlockTile  = ck_tile::sequence<2, 128>;
-    using WarpTile   = ck_tile::sequence<1, 64>;
-    using Vector     = ck_tile::sequence<1, 1>;
+    using BlockTile      = ck_tile::sequence<2, 128>;
+    using Vector         = ck_tile::sequence<1, 1>;
+    using ThreadPerBlock = ck_tile::sequence<2, 128>;
 
-    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
     using Problem = ck_tile::SmoothquantPipelineProblem<XDataType,
                                                         SmoothScaleDataType,
                                                         ComputeDataType,
diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp
index 02ab1cd9b1..50da06f204 100644
--- a/example/ck_tile/12_smoothquant/smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.cpp
@@ -1,5 +1,6 @@
 #include "ck_tile/host.hpp"
 #include "smoothquant.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 #include <cstring>
 
 // different threshold for different dtype
@@ -39,7 +40,9 @@ auto create_args(int argc, char* argv[])
         .insert("kname", "1", "print kernel name or not")
         .insert("prec", "fp16", "precision")
         .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "smoothquant.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -202,6 +205,19 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_smoothquant_json(arg_parser.get_str("jsonfile"),
+                              data_type,
+                              m,
+                              n,
+                              x_stride,
+                              y_stride,
+                              ave_time,
+                              0,
+                              gb_per_sec,
+                              pass);
+    }
     return pass;
 }
 
diff --git a/example/ck_tile/12_smoothquant/smoothquant.hpp b/example/ck_tile/12_smoothquant/smoothquant.hpp
index 5f8254a664..f881c7e2a9 100644
--- a/example/ck_tile/12_smoothquant/smoothquant.hpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.hpp
@@ -49,54 +49,16 @@ struct smoothquant_traits_
 {
     using DataType = ck_tile::remove_cvref_t<DataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
+    using Shape          = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN    = kPadN_;
     static constexpr bool kTwoPass = kTwoPass_;
diff --git a/example/ck_tile/13_moe_sorting/README.md b/example/ck_tile/13_moe_sorting/README.md
index c99f40aa57..1fd40aab35 100644
--- a/example/ck_tile/13_moe_sorting/README.md
+++ b/example/ck_tile/13_moe_sorting/README.md
@@ -6,32 +6,36 @@ This folder contains example for moe-sorting kernel using ck_tile tile-programmi
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_example_moe_sorting -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_moe_sorting -j`nproc`
 ```
 This will result in an executable `build/bin/tile_example_moe_sorting`
 
 ## example
 ```
 args:
-           -v    turn CPU validation on (1) or off (0). (default:1)
-        -pr_i    index data type.  Only int32 is currently supported. (default:int32)
-        -pr_w    output weight data type. Only fp32 is currently supported. (default:fp32)
-           -t    number of input tokens. (default:128)
-                 If "local_t" presents, this value indicates global concurrency of all ranks.
-     -local_t    Number of local input tokens for curent rank. (default:-1)
-                 This value must be within range "[0, t)", or "-1"(no such feature)
-                 This feature is to simulate EP case where where each rank has different tokens.
-                 Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.
-           -e    number of num_experts (default:8)
-           -k    topk (default:4)
-        -unit    unit_size (default:32)
--moe_buf_size    moe_buf_size (default:0)
-   -local_eid    a list of experts enabled as local expert. e.g. "0,1,4,5" (default:-1)
-                 please make sure eid is in ascending order!
-        -seed    seed to be used. When set to -1, a random seed will be generated each time invoking this example (default:-1)
-       -kname    prints the kernel name when set to 1 (default:0)
-      -warmup    number of iterations before benchmark the kernel (default:5)
-      -repeat    number of iterations to benchmark the kernel (default:20)
-
+                 -v    turn CPU validation on (1) or off (0). (default:1)
+              -pr_i    index data type.  Only int32 is currently supported. (default:int32)
+              -pr_w    output weight data type. Only fp32 is currently supported. (default:fp32)
+                 -t    number of input tokens. (default:128)
+                       If "local_t" presents, this value indicates global concurrency of all ranks.
+           -local_t    Number of local input tokens for curent rank. (default:-1)
+                       This value must be within range "[0, t)", or "-1"(no such feature)
+                       This feature is to simulate EP case where where each rank has different tokens.
+                       Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.
+                 -e    number of num_experts (default:8)
+                 -k    topk (default:4)
+              -unit    unit_size (default:32)
+-moe_buf_interm_dim    interm_dim(col) of the following fmoe buf (default:0)
+-moe_buf_elem_bytes    fmoe buf element byte size, 1:8bit, 2:16bit, 4:32bit... (default:2)
+                -ci    clear workspace inside API or not(if "0", require manually clear outside) (default:1)
+          -dispatch    dispatch policy. 0:automatically pick up kernel, 1:use single kernel, 2:use mp kernel (default:0)
+         -local_eid    a list of experts enabled as local expert. e.g. "0,1,4,5" (default:-1)
+                       please make sure eid is in ascending order!
+              -seed    seed to be used. When set to -1, a random seed will be generated each time invoking this example (default:-1)
+             -kname    prints the kernel name when set to 1 (default:0)
+            -warmup    number of iterations before benchmark the kernel (default:5)
+            -repeat    number of iterations to benchmark the kernel (default:20)
+              -json    0: No Json, 1: Dump Results in Json format (default:0)
+          -jsonfile    json file name to dump results (default:moe_sorting.json)
 ```
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
index e9b4ea5cd3..b2ad4eb98c 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <set>
 #include <vector>
@@ -14,6 +14,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/reduce.hpp"
 #include "moe_sorting_api.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 auto create_args(int argc, char* argv[])
 {
@@ -59,7 +60,9 @@ auto create_args(int argc, char* argv[])
                 "invoking this example")
         .insert("kname", "0", "prints the kernel name when set to 1")
         .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "moe_sorting.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -437,6 +440,23 @@ bool test_moe_sorting(ck_tile::ArgParser args)
         printf(", (%d)", seed);
     printf("\n");
     fflush(stdout);
+
+    if(args.get_int("json") == 1)
+    {
+        dump_moe_sorting_json(args.get_str("jsonfile"),
+                              index_prec,
+                              weight_prec,
+                              workspace_size == 0 ? "cx" : (clear_inside ? "ci" : "co"),
+                              dispatch_policy,
+                              tokens,
+                              num_experts,
+                              topk,
+                              ms,
+                              0,
+                              0,
+                              rtn);
+    }
+
     return rtn;
 }
 
diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md
index c10a922607..f675f4bca9 100644
--- a/example/ck_tile/14_moe_smoothquant/README.md
+++ b/example/ck_tile/14_moe_smoothquant/README.md
@@ -9,7 +9,25 @@ Unlike standard smoothquant op, the input scale is from different expert `[exper
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_example_moe_smoothquant -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_moe_smoothquant -j`nproc`
 ```
 This will result in an executable `build/bin/tile_example_moe_smoothquant`
+
+## example
+```
+args:
+          -t    tokens dimension (default:3328)
+          -h    hidden_size dimension (default:4096)
+          -e    experts (default:32)
+          -k    topk (default:5)
+     -stride    stride per row, if -1 then equal to hidden_size (default:-1)
+          -v    cpu validation or not (default:1)
+      -kname    print kernel name or not (default:1)
+     -prec_i    input precision, fp16/bf16 (default:fp16)
+     -prec_o    precision, int8/fp8 (default:int8)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:moe_smoothquant.json)
+```
\ No newline at end of file
diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
index 848fb87dcf..9cd77ca5fa 100644
--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
@@ -1,5 +1,6 @@
 #include "ck_tile/host.hpp"
 #include "moe_smoothquant.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 #include <cstring>
 #include <set>
 
@@ -66,7 +67,9 @@ auto create_args(int argc, char* argv[])
         .insert("prec_i", "fp16", "input precision, fp16/bf16")
         .insert("prec_o", "int8", "precision, int8/fp8")
         .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "moe_smoothquant.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -244,6 +247,21 @@ bool run(const ck_tile::ArgParser& arg_parser)
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
 
+    if(arg_parser.get_int("json"))
+    {
+        dump_moe_smoothquant_json(arg_parser.get_str("jsonfile"),
+                                  prec_i,
+                                  prec_o,
+                                  tokens,
+                                  hidden_size,
+                                  stride,
+                                  experts,
+                                  topk,
+                                  pass,
+                                  ave_time,
+                                  0,
+                                  gb_per_sec);
+    }
     return pass;
 }
 
diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
index 36cf477a42..eff4bef025 100644
--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
@@ -38,54 +38,17 @@ struct moe_smoothquant_traits_
     using InputType  = ck_tile::remove_cvref_t<InputType_>;
     using OutputType = ck_tile::remove_cvref_t<OutputType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
 
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN    = kPadN_;
     static constexpr bool kTwoPass = kTwoPass_;
diff --git a/example/ck_tile/15_fused_moe/README.md b/example/ck_tile/15_fused_moe/README.md
index 089e1de78e..1376149177 100644
--- a/example/ck_tile/15_fused_moe/README.md
+++ b/example/ck_tile/15_fused_moe/README.md
@@ -69,4 +69,42 @@ summary of the key design of this fused-moe operator:
 //  4）num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one)
 //
 //   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
+```
+
+## example
+```
+args:
+          -t    number of input tokens. (default:128)
+                If "local_t" presents, this value indicates global concurrency of all ranks.
+    -local_t    Number of local input tokens for curent rank. (default:-1)
+                This value must be within range "[0, t)", or "-1"(no such feature)
+                This feature is to simulate EP case where where each rank has different tokens.
+                Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.
+          -e    num of experts (default:32)
+          -k    topk (default:5)
+          -h    hidden_size of this model (default:8192)
+          -i    intermediate_size between 2 gemms of FFN (default:8192)
+     -stride    stride per row, if -1 then equal to hidden_size (default:-1)
+         -bm    blocking factor for sorted tokens (default:32)
+         -tp    tensor parallel size (default:8)
+          -v    cpu validation or not (default:1)
+      -kname    print kernel name or not (default:1)
+     -prec_i    input precision (default:bf16)
+     -prec_w    weight precision (default:bf16)
+     -prec_o    output precision (default:bf16)
+    -prec_st    token scale data type. auto will set to fp32 (default:auto)
+    -prec_sw    weight scale data type. auto will set to fp32 (default:auto)
+    -prec_sq    (dynamic) smooth quant data type. auto will set to fp32 (default:auto)
+    -prec_kw    topk-weight data type. auto will set to fp32 (default:auto)
+     -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+  -gate_only    w0(gate/up) style, 0:gate+up will double interm size, 1:only gate (default:1)
+        -api    benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm (default:0)
+        -act    activation after first gemm. 0:gelu, 1:silu (default:0)
+    -balance    if set to 1, will try balance the expert in topk-ids(convenient for testing) (default:0)
+       -init    init method. 0:random stepped float(fast). 1: random uniform[-0.5, 0.5], 2:rand normalized[0, 1]normalized(slow) (default:1)
+       -seed    seed used to do random (default:11939)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:fused_moe.json)
 ```
\ No newline at end of file
diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp
index e4d87e5fef..5129b46231 100644
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include <algorithm>
 #include <cstring>
 #include <unordered_set>
@@ -5,6 +8,7 @@
 #include <set>
 
 #include "ck_tile/host.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 #include "fused_moe.hpp"
 
 // different threshold for different dtype
@@ -130,7 +134,9 @@ auto create_args(int argc, char* argv[])
                 "normalized(slow)")
         .insert("seed", "11939", "seed used to do random")
         .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "fused_moe.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -513,6 +519,29 @@ bool run(const ck_tile::ArgParser& arg_parser)
             std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
         }
         std::cout << std::flush << std::endl;
+
+        if(arg_parser.get_int("json") == 1)
+        {
+            dump_fused_moe_json(arg_parser.get_str("jsonfile"),
+                                api_str,
+                                prec_str,
+                                tokens,
+                                is_local_token,
+                                local_tokens,
+                                experts,
+                                topk,
+                                hidden_size,
+                                intermediate_size,
+                                stride,
+                                block_m,
+                                activation,
+                                gate_only,
+                                fused_quant,
+                                pass,
+                                ave_time,
+                                cal_tflops(ave_time),
+                                cal_tbps(ave_time));
+        }
         return pass;
     }
     else if(api == 1)
@@ -619,6 +648,29 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }
         std::cout << std::flush << std::endl;
 
+        if(arg_parser.get_int("json") == 1)
+        {
+            dump_fused_moe_json(arg_parser.get_str("jsonfile"),
+                                api_str,
+                                prec_str,
+                                tokens,
+                                is_local_token,
+                                local_tokens,
+                                experts,
+                                topk,
+                                hidden_size,
+                                intermediate_size,
+                                stride,
+                                block_m,
+                                activation,
+                                gate_only,
+                                fused_quant,
+                                pass,
+                                ave_time,
+                                cal_tflops(ave_time),
+                                cal_tbps(ave_time));
+        }
+
         return pass;
     }
     return false;
diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md
index 8a64a3912c..d82f20eb2b 100644
--- a/example/ck_tile/16_batched_gemm/README.md
+++ b/example/ck_tile/16_batched_gemm/README.md
@@ -15,23 +15,25 @@ This will result in an executable `build/bin/tile_example_batched_gemm`
 ## example
 ```
 args:
-              -m     m dimension (default:256)
-              -n     n dimension (default:128)
-              -k     k dimension (default:128)
-       -a_layout     A tensor data layout (default:R) (R for Row, C for Col)
-       -b_layout     B tensor data layout (default:R) (R for Row, C for Col)
-       -c_layout     C tensor data layout (default:R) (R for Row, C for Col)
-       -stride_a     Tensor A stride (default:128)
-       -stride_b     Tensor B stride (default:128)
-       -stride_c     Tensor C stride (default:128)
- -batch_stride_a     Batch A stride (default:32768)
- -batch_stride_b     Batch B stride (default:16384)
- -batch_stride_c     Batch C stride (default:32768)
-    -batch_count     Batch count (default:16)
-              -v     0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
-              -e     Absolute error tolerance (default:1e-5)
-           -prec     data type. fp16/bf16/fp8/bf8 (default:fp16)
-         -warmup     number of iterations before benchmark the kernel (default:10)
-         -repeat     number of iterations to benchmark the kernel (default:100)
-          -timer     gpu:gpu timer, cpu:cpu timer (default:gpu)
+               -m    m dimension (default:512)
+               -n    n dimension (default:1024)
+               -k    k dimension (default:2048)
+         -stride_a    Tensor A stride (default:0)
+         -stride_b    Tensor B stride (default:0)
+         -stride_c    Tensor C stride (default:0)
+         -a_layout    A tensor data layout - Row by default (default:R)
+         -b_layout    B tensor data layout - Row by default (default:C)
+         -c_layout    C tensor data layout - Row by default (default:R)
+   -batch_stride_a    Batch A stride (default:1048576)
+   -batch_stride_b    Batch B stride (default:2097152)
+   -batch_stride_c    Batch C stride (default:524288)
+      -batch_count    Batch count (default:8)
+               -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+            -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+         -warmup    number of iterations before benchmark the kernel (default:50)
+         -repeat    number of iterations to benchmark the kernel (default:100)
+            -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+         -split_k    splitK value (default:1)
+            -json    0: No Json, 1: Dump Results in Json format (default:0)
+         -jsonfile    json file name to dump results (default:cktile_batched_gemm.json)
 ```
\ No newline at end of file
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
index 09ba010e00..1164e28125 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -15,7 +15,8 @@
 #include "ck_tile/host.hpp"
 #include "batched_gemm.hpp"
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -27,54 +28,19 @@ template <typename ADataType,
           typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s)
 {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
+    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
+    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
+    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
 
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
+    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
+    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
+    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
 
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
+    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
+    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
 
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
+    constexpr bool DoubleSmemBuffer = GemmConfig::DoubleSmemBuffer;
 
     constexpr bool kPadM = false;
     constexpr bool kPadN = false;
@@ -105,7 +71,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
     using GemmPipelineProblem =
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
 
     const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
     const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
@@ -119,7 +86,7 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
         [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
             constexpr bool has_hot_loop_v   = has_hot_loop_.value;
             constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
 
             using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
@@ -131,7 +98,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
                                                                                has_hot_loop_v,
                                                                                tail_number_v>;
 
-            using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
 
             using GemmEpilogue = ck_tile::CShuffleEpilogue<
                 ck_tile::CShuffleEpilogueProblem<ADataType,
@@ -207,7 +175,11 @@ int main(int argc, char* argv[])
 {
     try
     {
-        return !run_batched_gemm_example(argc, argv);
+#if CK_TILE_USE_WMMA
+        return !run_batched_gemm_example<GemmConfigV3_Wmma>(argc, argv);
+#else
+        return !run_batched_gemm_example<GemmConfigV3>(argc, argv);
+#endif
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
index 78d915e873..33da0bf0a5 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -9,30 +9,122 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 
-#ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
-#endif
+struct GemmConfigMemory
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 64;
 
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#else
-#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
-#endif
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+struct GemmConfigV3
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV4
+{
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV3_Wmma
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
 
 template <typename DataType>
 struct BatchedGemmTypeConfig;
@@ -70,12 +162,14 @@ auto create_args(int argc, char* argv[])
         .insert("batch_stride_b", "2097152", "Batch B stride")
         .insert("batch_stride_c", "524288", "Batch C stride")
         .insert("batch_count", "8", "Batch count")
-        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
         .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("split_k", "1", "splitK value");
+        .insert("split_k", "1", "splitK value")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "cktile_batched_gemm.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
index 6d26cfe675..8c5249fdad 100644
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -2,7 +2,6 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
                          const float max_accumulated_value)
@@ -23,7 +22,8 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
     return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -65,7 +65,8 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                       batch_stride_C,
                                       batch_count};
 
-    float ave_time = batched_gemm<ADataType,
+    float ave_time = batched_gemm<GemmConfig,
+                                  ADataType,
                                   BDataType,
                                   DsDataType,
                                   AccDataType,
@@ -77,25 +78,10 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                   CDEElementWise>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
-    std::string op_name{"Batched Gemm"};
-    std::size_t flop     = std::size_t(2) * batch_count * M * N * K;
-    std::size_t num_byte = sizeof(ADataType) * batch_count * M * K +
-                           sizeof(BDataType) * batch_count * N * K +
-                           sizeof(CDataType) * batch_count * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
-              << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B
-              << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : "
-              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << std::endl;
-
     return ave_time;
 }
 
-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename GemmConfig, typename ALayout, typename BLayout, typename CLayout>
 int run_batched_gemm_example_with_layouts(int argc,
                                           char* argv[],
                                           const ALayout a_layout                  = ALayout{},
@@ -186,31 +172,48 @@ int run_batched_gemm_example_with_layouts(int argc,
     c_m_n_dev_buf.SetZero();
     c_m_n_dev_result.SetZero();
 
-    invoke_batched_gemm<ADataType,
-                        BDataType,
-                        ck_tile::tuple<>,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        ck_tile::tuple<>,
-                        CLayout>(a_m_k_dev_buf,
-                                 b_k_n_dev_buf,
-                                 c_m_n_dev_buf,
-                                 M,
-                                 N,
-                                 K,
-                                 stride_A,
-                                 stride_B,
-                                 stride_C,
-                                 batch_stride_A,
-                                 batch_stride_B,
-                                 batch_stride_C,
-                                 batch_count,
-                                 kbatch,
-                                 n_warmup,
-                                 n_repeat);
+    float ave_time = invoke_batched_gemm<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         ck_tile::tuple<>,
+                                         AccDataType,
+                                         CDataType,
+                                         ALayout,
+                                         BLayout,
+                                         ck_tile::tuple<>,
+                                         CLayout>(a_m_k_dev_buf,
+                                                  b_k_n_dev_buf,
+                                                  c_m_n_dev_buf,
+                                                  M,
+                                                  N,
+                                                  K,
+                                                  stride_A,
+                                                  stride_B,
+                                                  stride_C,
+                                                  batch_stride_A,
+                                                  batch_stride_B,
+                                                  batch_stride_C,
+                                                  batch_count,
+                                                  kbatch,
+                                                  n_warmup,
+                                                  n_repeat);
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+    std::string op_name{"Batched Gemm"};
+    std::size_t flop     = std::size_t(2) * batch_count * M * N * K;
+    std::size_t num_byte = sizeof(ADataType) * batch_count * M * K +
+                           sizeof(BDataType) * batch_count * N * K +
+                           sizeof(CDataType) * batch_count * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B
+              << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : "
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
     bool pass = true;
 
     if(arg_parser.get_int("v") == 1)
@@ -246,23 +249,9 @@ int run_batched_gemm_example_with_layouts(int argc,
         c_m_n_gpu_ref.SetZero();
         c_m_n_gpu_buf_ref.SetZero();
 
-        ADataType* d_A;
-        BDataType* d_B;
-        CDataType* d_C;
-
-        ck_tile::hip_check_error(hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)));
-        ck_tile::hip_check_error(hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)));
-        ck_tile::hip_check_error(hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)));
-
-        ck_tile::hip_check_error(hipMemcpy(d_A,
-                                           a_m_k_dev_buf.GetDeviceBuffer(),
-                                           batch_count * M * K * sizeof(ADataType),
-                                           hipMemcpyHostToDevice));
-
-        ck_tile::hip_check_error(hipMemcpy(d_B,
-                                           b_k_n_dev_buf.GetDeviceBuffer(),
-                                           batch_count * N * K * sizeof(BDataType),
-                                           hipMemcpyHostToDevice));
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
 
         ck_tile::reference_batched_gemm_gpu<ADataType,
                                             BDataType,
@@ -284,15 +273,6 @@ int run_batched_gemm_example_with_layouts(int argc,
                                                      batch_stride_C,
                                                      batch_count);
 
-        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
-                                           d_C,
-                                           batch_count * M * N * sizeof(CDataType),
-                                           hipMemcpyDeviceToHost));
-
-        ck_tile::hip_check_error(hipFree(d_A));
-        ck_tile::hip_check_error(hipFree(d_B));
-        ck_tile::hip_check_error(hipFree(d_C));
-
         c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
         const float max_accumulated_value =
             *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
@@ -310,9 +290,31 @@ int run_batched_gemm_example_with_layouts(int argc,
         std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_batched_gemm_json_results(arg_parser.get_str("jsonfile"),
+                                       op_name,
+                                       M,
+                                       N,
+                                       K,
+                                       stride_A,
+                                       stride_B,
+                                       stride_C,
+                                       batch_stride_A,
+                                       batch_stride_B,
+                                       batch_stride_C,
+                                       batch_count,
+                                       pass,
+                                       ave_time,
+                                       tflops,
+                                       gb_per_sec,
+                                       "batched_gemm");
+    }
+
     return pass;
 }
 
+template <typename GemmConfig>
 int run_batched_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -331,7 +333,7 @@ int run_batched_gemm_example(int argc, char* argv[])
     // }
     if(a_layout == "R" && b_layout == "C")
     {
-        return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+        return run_batched_gemm_example_with_layouts<GemmConfig>(argc, argv, Row{}, Col{}, Row{});
     }
     // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not
     // work else if(a_layout == "C" && b_layout == "C")
diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
index 475c13166d..bbfb2df006 100644
--- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -1 +1,12 @@
 add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
+add_executable(tile_example_quant_grouped_gemm EXCLUDE_FROM_ALL quant_grouped_gemm.cpp)
+add_executable(tile_example_grouped_gemm_preshuffle EXCLUDE_FROM_ALL grouped_gemm_preshuffle.cpp)
+add_executable(tile_example_grouped_gemm_multi_d EXCLUDE_FROM_ALL grouped_gemm_multi_d.cpp)
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
\ No newline at end of file
diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md
index 8715ee79e1..09bf3e167a 100644
--- a/example/ck_tile/17_grouped_gemm/README.md
+++ b/example/ck_tile/17_grouped_gemm/README.md
@@ -1,147 +1,37 @@
-# Grouped Gemm
-
-Grouped General Matrix Multiplication (Grouped GEMM) is a technique used in GPU computing and high-performance computing to batch together multiple independent GEMM operations (matrix multiplications) into a single kernel launch in order to improve performance and efficiency. This folder contains Grouped GEMM examples that use the ck_tile tile-programming implementation.  
-
 ## Quick Tour for New Users
 
 The `Grouped GEMM` operators are versions of GEMM that run multiple GEMM operations within a single kernel call. Each GEMM operation performs a matrix multiplication. Unlike regular batched GEMM operations where both matrices must be of the same size and have the same configuration, Grouped GEMM operations can take matrices with different sizes and configurations, making them more flexible for diverse workloads.
 
-Let's now break the example into the following parts: parsing arguments, preparing host and device buffers, preparing data, invoking GEMM, and building the example, while explaining each function.
+### Preshuffle and Persistence
 
-### Parsing Arguments
-The example takes three arguments: `group_count`, `repeat`, and `warmup`:
-- `group_count`: the number of GEMM operations in the group, 
-- `repeat`: the number of times to repeat the kernel for benchmarking
-- `warmup`: the number of iterations before the actual kernel run time measure.
+The grouped GEMM examples include two advanced optimization features:
 
-```cpp
-// Example
-const int group_count = arg_parser.get_int("group_count");
-const int repeat      = arg_parser.get_int("repeat");
-const int warmup      = arg_parser.get_int("warmup");
-```
-In the next step, the input parameters `Ms`, `Ns`, `Ks`, as well as the corresponding `stride_As`, `stride_Bs`, and `stride_Cs` are either provided from the comand line or generated by default. Since one or more input data sets are expected for `A` and `B`, each parameter is stored in a `std::vector`. The size of the `vector` is defined by `group_count`.
+#### Weight Preshuffle
+Weight preshuffle is an optimization technique that reorganizes the B matrix (weights) in memory to improve data access patterns and reduce memory bandwidth requirements. This is particularly beneficial for inference workloads where the same weights are reused across multiple batches.
 
-```cpp
-// Example
-std::vector<ck_tile::index_t> Ms        = arg_parser.get_int_vec("Ms");
-std::vector<ck_tile::index_t> Ns        = arg_parser.get_int_vec("Ns");
-std::vector<ck_tile::index_t> Ks        = arg_parser.get_int_vec("Ks");
-std::vector<ck_tile::index_t> stride_As = arg_parser.get_int_vec("stride_As");
-std::vector<ck_tile::index_t> stride_Bs = arg_parser.get_int_vec("stride_Bs");
-std::vector<ck_tile::index_t> stride_Cs = arg_parser.get_int_vec("stride_Cs");
-```
-Where:
-- `Ms` is the M dimension of each GEMM.
-- `Ns` is the N dimension of each GEMM.
-- `Ks` is the K dimension of each GEMM.
-- `stride_As` is the stride values for matrix A.
-- `stride_Bs` is the stride  values for matrix B.
-- `stride_Cs` is the stride  values for matrix C.
+- **Implementation**: Available in `grouped_gemm_preshuffle.cpp` 
+- **Configuration**: Uses `GemmConfigPreshuffleDecode` and `GemmConfigPreshufflePrefill` template configuration
+- **Constraints**: Currently supports only A(Row major) + B(Column major) → C(Row major) layouts
 
-### HostTensor and Device Memory Buffers (for CPU and GPU) 
-Each parameter `Ms`, `Ns`, `Ks`, `stride_As`, `stride_Bs` and `stride_Cs` contains values for more than one matrix, meaning different matrix sizes and strides can be used for different grouped GEMM computations.
-The next step is to properly load the input values. For each input matrix, `A` and `B`, and for each output matrix, `C`, you need to create both `HostTensor` and `DeviceMemory`, where: 
-- `HostTensor` represents the matrix data on the host (CPU). It stores the data before they are transferred to the device for computation.
-- `DeviceMemory` represents the matrix data on the device (GPU). This will store the data on the GPU for computation during the Grouped GEMM operation.
 
-#### HostTensor Buffers (for CPU)
-In the first step, create `HostTensor` for `A`, `B`, `C`. `HostTensor` allocates memory on the host (CPU) to store the matrices, initializing the memory with the appropriate dimensions and values to store the data. Below is an example code showing how to create HostTensors for those tensors:
-```cpp
-// Example
-std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
-std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
-std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
-```
-Where:
-- `a_m_k_tensors` is the vector of `HostTensor` objects for matrix `A` (with dimensions `M × K`). Each tensor stores the data for single GEMM operation.
-- `b_k_n_tensors` is the vector of `HostTensor` objects for matrix `B` (with dimensions `K × N`).
-- `c_m_n_tensors` is the vector of `HostTensor` objects for matrix `C` (the output matrix with dimensions `M × N`).
+#### Persistence Mode
+Persistence mode is a GPU optimization where thread blocks remain active on the compute units to process multiple work items sequentially, reducing kernel launch overhead and improving occupancy.
 
-The `std::vector` container is used for this purpose throughout. As mentioned above, the number of HostTensors is equal to `group_count`.
+- **Template Parameter**: Controlled by the `Persistent` boolean template parameter in `invoke_gemm`
+- **Usage**: `invoke_gemm<ALayout, BLayout, CLayout, true>` enables persistence
 
-#### Device Memory Buffers (for GPU)
-Now it's time to allocate memory on the device (GPU) and transfer the data from `HostTensor` to `DeviceMemory` for actual computation..
-```cpp
-// Example
-std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
-std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
-std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
-``` 
-Where:
-- `a_m_k_dev_buf` is the buffer used for storing matrix A on the GPU.
-- `b_k_n_dev_buf` is the buffer used for storing matrix B on the GPU.
-- `c_m_n_dev_buf` is the buffer used for storing the result matrix C on the GPU.
+#### Multi-D Operations
+Multi-D operations extend the standard GEMM operation by supporting additional element-wise operations on the result tensor. This feature is particularly useful for workloads that require post-processing of the GEMM output.
 
-## Prepare data
-In the next step, the input tensors are populated. A pseudorandom number generator, an existing distribution (e.g., `FillUniformDistribution`), or user data can be used to populate the tensors. Descriptors also need to be create for each input tensor.
+- **Implementation**: Available in `grouped_gemm_multi_d.cpp`
+- **Operation**: E = C × D₀ × D₁ (where C = A × B is the standard GEMM result)
+- **Configuration**: Uses `GemmConfigV3`, `GemmConfigV4`, `GemmConfigMemory` template configuration with 2 D tensors
+- **Data Types**: Supports fp16 
+- **Benefits**: Enables complex operations like scaling, activation functions, or other element-wise transformations in a single kernel call
+- **Build Target**: `make tile_example_grouped_gemm_multi_d -j`
 
-Use `get_default_stride` to get the strides for A, B, and C. `get_default_stride` is a template function that calculates the default stride for a 2D array based on whether it is row-major or column-major. Template parameter determines whether the storage order is row-major (true) or column-major (false). The function takes four params `row`, `col`, `stride` and `bool_constant<is_row_major>`. If the stride is explicitly provided (`stride != 0`), the stride is returned as-is. If the stride is not provided (`stride == 0`), the function computes the default stride. For the Row-major order (`is_row_major == true`), the stride is set to the number of columns (col). For the column-major order (`is_row_major == false`), the stride is set to the number of rows (row). This function is useful when working with dynamically allocated 2D arrays, where the user may not specify the stride explicitly. It ensures a natural default stride based on the chosen storage order.
-
-```cpp
-// Example, API
-template <bool is_row_major>
-auto get_default_stride(std::size_t row, std::size_t col, std::size_t stride, bool_constant<is_row_major>) {
-  // code
-}
-```
-
-Where: 
-- `is_row_major` is a bool template parameter that determines whether the storage order is row-major (true) or column-major (false).
-- `row` is the number of rows in the matrix.
-- `col` is the number of columns in the matrix.
-- `stride` is the current stride (the distance between consecutive elements in memory).
-- `bool_constant<is_row_major>` is a tag type that helps in differentiating behavior at compile-time.
-
-Next host descriptors for each of the input tensors, A, B, and C are created. Use the `f_host_tensor_descriptor` function defined below. This function takes four parameters, row, col, stride, and layout, and returns a HostTensorDescriptor based on the specified layout.
-
-```cpp
-// Example for tensor A
-ck_tile::HostTensor<ADataType>(f_host_tensor_descriptor(M, K, stride_As[i], a_layout)))
-```
-
-After creating the host_tensors, create `deviceMem` for each tensor `A`, `B`, and `C`, and then transfer the data to the device. The `get_element_space_size_in_bytes()` function is used to get the buffer size in bytes. Use `ToDevice()` to transfer data from the host to the device. The data that was previously generated (`a_m_k_tensors[i].data()`) is passed as a parameter to `ToDevice()`.
-
-The final step before running the GEMM operation is to retrieve the pointers to the buffers of `A`, `B`, and `C` stored on the device using `->GetDeviceBuffer()` and pack them into a shared container. For example: `gemm_descs.push_back({p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]})`, where `gemm_descs` is `std::vector<grouped_gemm_kargs> gemm_descs` ([Code](https://github.com/ROCm/composable_kernel/blob/develop/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc#L221)). The container should include values such as:
-```cpp
-struct GroupedGemmHostArgs
-{
-    const void* a_ptr;
-    const void* b_ptr;
-    void* c_ptr;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-};
-```
-The data prepared in this way can be passed to the `invoke_gemm` function. This is a templated function that also takes three template parameters: `ALayout`, `BLayout`, and `CLayout`:
-```cpp
-// Example, API
-template <typename ALayout, typename BLayout, typename CLayout, bool Persistent>
-float invoke_gemm(int n_warmup,
-                  int n_repeat,
-                  int group_count,
-                  const std::vector<grouped_gemm_kargs>& args)
-```
-`invoke_gemm` returns the run time in milliseconds. The workspace memory required for computation is allocated. Workspace memory on the GPU refers to temporary memory buffers allocated when some operations are run. This extra space is needed to hold GEMM descriptions. The following structure can be used to allocate workspace:
-
-```cpp
-// Example
-ck_tile::DeviceMem gemm_workspace;
-gemm_workspace.Realloc(GetWorkspaceSize(args));
-```
-Finally the arguments are passed to group_gemm and the kernel is launched.
-```cpp
-// API
-template <typename ALayout, typename BLayout, typename CLayout>
-float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
-                   const ck_tile::stream_config& s,
-                   void* kargs_ptr)
-```
-All the necessary parameters are set, the tiling is computed, the GEMM pipeline and epilogue are prepared, and the GroupedGemmKernel is launched.
+Multi-D operations supports both persistence and non-persistence modes.
+Weight preshuffle supports only on non-persistence mode.
 
 ## Build
 ```
@@ -151,8 +41,15 @@ mkdir build && cd build
 ../script/cmake-ck-dev.sh  ../ <arch>
 # The basic pipeline method on the gemm calculation
 make tile_example_grouped_gemm -j
+# The preshuffle example
+make tile_example_grouped_gemm_preshuffle -j
+# The multi-D operations example
+make tile_example_grouped_gemm_multi_d -j
+# The quant grouped gemm fp8 example
+make tile_example_quant_grouped_gemm -j
 ```
-This will result in an executable `build/bin/tile_example_grouped_gemm`
+Each example will result in an corresponding executable `build/bin/tile_example_grouped_gemm`, `build/bin/tile_example_grouped_gemm_preshuffle`, `build/bin/tile_example_grouped_gemm_multi_d`, and `build/bin/tile_example_quant_grouped_gemm`.
+
 
 ## example
 ```
@@ -166,8 +63,25 @@ args:
  -a_layout    A tensor data layout - (Default: Row).
  -b_layout    B tensor data layout - (Default: Col).
  -c_layout    C tensor data layout - (Default: Row).
+ -prec        data type. fp16/fp8 - (Default: fp16).
  -validate    0. No validation, 1. Validation on CPU. (Default: 1).
  -warmup      Number of iterations before benchmark the kernel. (Default: 10).
  -repeat      Number of iterations to benchmark the kernel. (Default: 100).
  -group_count Group count. (Default: 16).
+ -kbatch      kbatch for SplitK (Default: 1).
+ -json        0: No Json, 1: Dump Results in Json format (Default: 0).
+ -jsonfile    json file name to dump results (Default: grouped_gemm.json).
 ```
+
+If any of `Ms`, `Ns`, `Ks`, `stride_As`, `stride_Bs`, or `stride_Cs` are missing or their sizes
+don't match `group_count`, the example generates defaults per group index `i` (0-based):
+
+```text
+M[i] = 256 + 256 * i
+N[i] = 256 + 512 * i
+K[i] = 512 + 384 * i
+
+stride_A[i] = K[i]
+stride_B[i] = K[i]
+stride_C[i] = N[i]
+```
\ No newline at end of file
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
index 527ef1e466..606d98d9e2 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -16,6 +16,155 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"
 
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                   const ck_tile::stream_config& s,
+                   void* kargs_ptr)
+{
+
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+
+    using Traits              = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                        GemmConfig::kPadN,
+                                                        GemmConfig::kPadK,
+                                                        ALayout,
+                                                        BLayout,
+                                                        CLayout>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 GemmConfig::TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain = gemm_descs[0].k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             CLayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
+        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKargs(gemm_descs);
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Kernel arguments not supported!");
+        }
+
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(gemm_descs);
+
+        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
+                                            kargs.data(),
+                                            get_workspace_size(gemm_descs),
+                                            hipMemcpyHostToDevice,
+                                            s.stream_id_));
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
+        }
+
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       gemm_descs.size()));
+
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(gemm_descs[0].k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+    return ave_time;
+}
+
 template <typename GemmConfig,
           typename ALayout,
           typename BLayout,
@@ -29,16 +178,15 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
                             void* kargs_ptr,
                             bool splitk)
 {
-    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    constexpr ck_tile::index_t TileParitionerM01      = 4;
-
     using GemmShape = ck_tile::TileGemmShape<
         ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
         ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
         ck_tile::
             sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
-    using TilePartitioner = ck_tile::
-        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
 
     using GemmUniversalTraits =
         ck_tile::PersistentTileGemmUniversalTraits<GemmConfig::kPadM,
@@ -124,8 +272,92 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
 
 #include "run_grouped_gemm_example.inc"
 
-constexpr bool Persistent = true;
+template <typename GemmConfig, typename PrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Types = GemmTypeConfig<PrecType>;
+    // Specific type aliases for easy access
+    using ADataType   = typename Types::ADataType;
+    using BDataType   = typename Types::BDataType;
+    using AccDataType = typename Types::AccDataType;
+    using CDataType   = typename Types::CDataType;
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Col{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A and B tensors!");
+    }
+}
+
+template <template <typename PrecType> typename GemmConfig>
+int run_grouped_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type configuration.");
+    }
+}
+
 int main(int argc, char* argv[])
 {
-    return !run_grouped_gemm_example<Persistent, GemmConfigComputeV4>(argc, argv);
+#if CK_TILE_USE_WMMA
+    return !run_grouped_gemm_example<GemmConfigComputeV4_Wmma>(argc, argv);
+#else
+    return !run_grouped_gemm_example<GemmConfigComputeV4>(argc, argv) ||
+           !run_grouped_gemm_example<GemmConfigComputeV3_2>(argc, argv) ||
+           !run_grouped_gemm_example<GemmConfigComputeV4_V2>(argc, argv);
+#endif
 }
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
index e992cb3118..10d7befc06 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -4,19 +4,17 @@
 #pragma once
 
 #include <string>
+#include <tuple>
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+#include "ck_tile/utility/json_dump.hpp"
 
 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
-
-#ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
-#endif
+#define CK_TILE_PIPELINE_PRESHUFFLE_V2 4
 
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile()
@@ -36,6 +34,22 @@ constexpr ck_tile::index_t get_k_warp_tile()
 #endif
 }
 
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile_flatmm()
+{
+#if defined(CK_GFX950_SUPPORT)
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 64;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 128;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 32;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 64;
+#endif
+}
+
 template <typename DataType>
 struct GemmTypeConfig;
 
@@ -76,6 +90,8 @@ struct GemmConfigBase
     static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
     static constexpr ck_tile::index_t NumWaveGroups = 1;
     static constexpr bool Preshuffle                = false;
+    static constexpr bool Persistent                = true;
+    static constexpr bool DoubleSmemBuffer          = false;
 };
 
 template <typename PrecType>
@@ -122,6 +138,123 @@ struct GemmConfigComputeV4 : public GemmConfigBase
     static constexpr int kBlockPerCu = 2;
 };
 
+template <typename PrecType>
+struct GemmConfigComputeV4_V2 : public GemmConfigBase
+{
+    // Compute V4 only support Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshuffleDecode : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+
+    static constexpr bool kPadK = true;
+
+    static constexpr int kBlockPerCu           = 1;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = true;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshufflePrefill : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+
+    static constexpr int kBlockPerCu           = 2;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr bool kPadK                = true;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4_Wmma : public GemmConfigBase
+{
+    // Compute V4 only support Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshuffleDecode_Wmma : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 32 / sizeof(PrecType);
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool kPadK = true;
+
+    static constexpr int kBlockPerCu           = 1;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = true;
+};
+
 template <ck_tile::index_t PipelineId>
 struct PipelineTypeTraits;
 
@@ -152,9 +285,19 @@ struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
     using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
 };
 
-using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V2>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline =
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+};
 
-auto create_args(int argc, char* argv[])
+using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs<>;
+
+std::pair<bool, ck_tile::ArgParser> create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("Ms", "", "M dimensions - empty by default.")
@@ -171,18 +314,64 @@ auto create_args(int argc, char* argv[])
         .insert("warmup", "10", "number of iterations before benchmark the kernel.")
         .insert("repeat", "100", "number of iterations to benchmark the kernel.")
         .insert("group_count", "8", "group count.")
-        .insert("kbatch", "1", "kbatch for SplitK");
+        .insert("kbatch", "1", "kbatch for SplitK")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "grouped_gemm.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
+    return std::make_pair(result, arg_parser);
 }
 
 inline std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
 {
-    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
+    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<>);
 }
 
-template <typename ADataType,
+template <typename GemmConfig, typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[0];
+
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+}
+
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -191,7 +380,6 @@ template <typename ADataType,
           typename BLayout,
           typename DsLayout,
           typename CLayout,
-          bool Persistent,
           typename CDEElementWise>
 float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                    const ck_tile::stream_config& s,
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.cpp
new file mode 100644
index 0000000000..98b0428d39
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.cpp
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "grouped_gemm_multi_d.hpp"
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise>
+float grouped_gemm_multi_d(const std::vector<grouped_gemm_multi_d_kargs>& gemm_descs,
+                           const ck_tile::stream_config& s,
+                           void* kargs_ptr)
+{
+
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+
+    using Traits              = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                        GemmConfig::kPadN,
+                                                        GemmConfig::kPadK,
+                                                        ALayout,
+                                                        BLayout,
+                                                        ELayout>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain = gemm_descs[0].k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             EDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
+
+        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKargs(gemm_descs);
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Kernel arguments not supported!");
+        }
+
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(gemm_descs);
+
+        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
+                                            kargs.data(),
+                                            get_workspace_size(gemm_descs),
+                                            hipMemcpyHostToDevice,
+                                            s.stream_id_));
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: { "
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
+        }
+
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       gemm_descs.size()));
+
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(gemm_descs[0].k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise>
+float grouped_gemm_multi_d_tileloop(const ck_tile::stream_config& s,
+                                    const ck_tile::index_t num_groups,
+                                    void* kargs_ptr,
+                                    bool splitk)
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+    using GemmUniversalTraits =
+        ck_tile::PersistentTileGemmUniversalTraits<GemmConfig::kPadM,
+                                                   GemmConfig::kPadN,
+                                                   GemmConfig::kPadK,
+                                                   GemmConfig::DoubleSmemBuffer,
+                                                   ALayout,
+                                                   BLayout,
+                                                   ELayout>;
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        // We create the GEMM pipeline without specifying hotloop or tailnumber.
+        // These are automatically run inside the kernel based on the given input data.
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             EDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
+        using Kernel      = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::MaxOccupancyGridSize(s);
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
+        }
+
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       num_groups));
+
+        return ave_time;
+    };
+    if(!splitk)
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::set>{});
+    }
+    else
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::atomic_add>{});
+    }
+
+    return ave_time;
+}
+
+#include "run_grouped_gemm_multi_d_example.inc"
+
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_grouped_gemm_multi_d_example<GemmConfigV3_Wmma>(argc, argv);
+#else
+    return !run_grouped_gemm_multi_d_example<GemmConfigV3>(argc, argv) ||
+           !run_grouped_gemm_multi_d_example<GemmConfigMemory>(argc, argv) ||
+           !run_grouped_gemm_multi_d_example<GemmConfigV4>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.hpp
new file mode 100644
index 0000000000..d5203a799c
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_multi_d.hpp
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+using ADataType   = ck_tile::half_t;
+using BDataType   = ck_tile::half_t;
+using D0DataType  = ck_tile::half_t;
+using D1DataType  = ck_tile::half_t;
+using EDataType   = ck_tile::half_t;
+using DsDataType  = ck_tile::tuple<D0DataType, D1DataType>;
+using AccDataType = float;
+
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
+#endif
+}
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool TransposeC = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr bool Preshuffle = false; // currently preshuffle == true is not supported yet
+    static constexpr bool Persistent = false; // currently persistent == true is not supported yet
+    static constexpr bool DoubleSmemBuffer =
+        false; // currently double smem buffer == true is not supported yet
+};
+
+struct GemmConfigMemory : public GemmConfigBase
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+struct GemmConfigV3 : public GemmConfigBase
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool Persistent           = true;
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+struct GemmConfigV4 : public GemmConfigBase
+{
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV3_Wmma : public GemmConfigBase
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+using grouped_gemm_multi_d_kargs = ck_tile::GroupedGemmHostArgs<DsDataType::size()>;
+
+std::pair<bool, ck_tile::ArgParser> create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("Ms", "", "M dimensions - empty by default.")
+        .insert("Ns", "", "N dimensions - empty by default.")
+        .insert("Ks", "", "K dimensions - empty by default.")
+        .insert("stride_As", "", "Tensor A strides - it is empty by default.")
+        .insert("stride_Bs", "", "Tensor B strides - it is empty by default.")
+        .insert("stride_Ds", "", "Tensor Ds strides - it is empty by default.")
+        .insert("stride_Es", "", "Tensor E strides - it is empty by default.")
+        .insert("a_layout", "R", "A tensor data layout - Row by default.")
+        .insert("b_layout", "C", "B tensor data layout - Row by default.")
+        .insert("ds_layout", "R", "Ds tensor data layout - Row by default.")
+        .insert("e_layout", "R", "E tensor data layout - Row by default.")
+        .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
+        .insert("prec", "fp16", "data type. fp16")
+        .insert("warmup", "10", "number of iterations before benchmark the kernel.")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel.")
+        .insert("group_count", "8", "group count.")
+        .insert("kbatch", "1", "kbatch for SplitK")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "grouped_gemm.json", "json file name to dump results");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_pair(result, arg_parser);
+}
+
+inline std::size_t get_workspace_size(const std::vector<grouped_gemm_multi_d_kargs>& gemm_descs)
+{
+    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<DsDataType::size()>);
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise>
+float grouped_gemm_multi_d(const std::vector<grouped_gemm_multi_d_kargs>& gemm_descs,
+                           const ck_tile::stream_config& s,
+                           void* kargs_ptr);
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_preshuffle.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_preshuffle.cpp
new file mode 100644
index 0000000000..4ce55e8e72
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_preshuffle.cpp
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "grouped_gemm.hpp"
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                   const ck_tile::stream_config& s,
+                   void* kargs_ptr)
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+
+    using Traits              = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                        GemmConfig::kPadN,
+                                                        GemmConfig::kPadK,
+                                                        ALayout,
+                                                        BLayout,
+                                                        CLayout,
+                                                        GemmConfig::NumWaveGroups>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 GemmConfig::Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain = gemm_descs[0].k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop =
+        // if preshuffle == true then num_loop is recalculated for each group in the kernel code
+        TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             CLayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
+        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKargs(gemm_descs);
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Kernel arguments not supported!");
+        }
+
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(gemm_descs);
+
+        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
+                                            kargs.data(),
+                                            get_workspace_size(gemm_descs),
+                                            hipMemcpyHostToDevice,
+                                            s.stream_id_));
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
+        }
+
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       gemm_descs.size()));
+
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(gemm_descs[0].k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+    return ave_time;
+}
+
+#include "run_grouped_gemm_example.inc"
+
+template <typename GemmConfig, typename PrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Types = GemmTypeConfig<PrecType>;
+    // Specific type aliases for easy access
+    using ADataType   = typename Types::ADataType;
+    using BDataType   = typename Types::BDataType;
+    using AccDataType = typename Types::AccDataType;
+    using CDataType   = typename Types::CDataType;
+
+    // Preshuffle is supported only for A(Row major), B(column major) input matrices!
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+}
+template <template <typename PrecType> typename GemmConfig>
+int run_grouped_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type configuration.");
+    }
+}
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_grouped_gemm_example<GemmConfigPreshuffleDecode_Wmma>(argc, argv);
+#else
+    return !run_grouped_gemm_example<GemmConfigPreshuffleDecode>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.cpp
new file mode 100644
index 0000000000..409bb173a1
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.cpp
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm_quant.hpp"
+#include "ck_tile/host.hpp"
+#include "quant_grouped_gemm.hpp"
+
+template <typename GemmConfig,
+          typename ALayout,
+          typename AQLayout,
+          typename BLayout,
+          typename BQLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename BQDataType,
+          typename AccDataType,
+          typename CDataType>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr)
+{
+    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+    using TilePartitioner = ck_tile::
+        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+    constexpr ck_tile::QuantType QuantMode = ck_tile::QuantType::RowColQuant;
+    using GemmUniversalTraits              = ck_tile::TileGemmQuantTraits<GemmConfig::kPadM,
+                                                                          GemmConfig::kPadN,
+                                                                          GemmConfig::kPadK,
+                                                                          false,
+                                                                          ALayout,
+                                                                          BLayout,
+                                                                          CLayout,
+                                                                          QuantMode,
+                                                                          AQLayout,
+                                                                          BQLayout,
+                                                                          GemmConfig::DoubleSmemBuffer,
+                                                                          true>;
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+        constexpr bool transpose_c      = false;
+
+        using QuantGemmProblem = ck_tile::GemmRowColTensorQuantPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               transpose_c,
+                                                                               BDataType,
+                                                                               scheduler>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<QuantGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             ck_tile::tuple<>,
+                                             AccDataType,
+                                             CDataType,
+                                             ck_tile::tuple<>,
+                                             CLayout,
+                                             ck_tile::element_wise::PassThrough,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             QuantGemmProblem::TransposeC,
+                                             memory_operation>>;
+        using Kernel      = ck_tile::QuantGroupedGemmKernel<TilePartitioner,
+                                                            GemmPipeline,
+                                                            GemmEpilogue,
+                                                            GemmUniversalTraits::kQuantType>;
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::MaxOccupancyGridSize(s);
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
+        }
+
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       num_groups));
+
+        return ave_time;
+    };
+
+    Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                   ck_tile::memory_operation_enum::set>{});
+
+    return ave_time;
+}
+
+#include "quant_run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[])
+{
+    return !run_grouped_gemm_example<GemmConfigComputeV3_2>(argc, argv);
+}
diff --git a/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.hpp
new file mode 100644
index 0000000000..93e461b9d3
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/quant_grouped_gemm.hpp
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#endif
+
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
+#endif
+}
+
+template <typename DataType>
+struct GemmTypeConfig;
+
+template <>
+struct GemmTypeConfig<ck_tile::fp8_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::fp8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_2 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 1;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+using grouped_gemm_kargs = ck_tile::QuantGroupedGemmHostArgs;
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("Ms", "", "M dimensions - empty by default.")
+        .insert("Ns", "", "N dimensions - empty by default.")
+        .insert("Ks", "", "K dimensions - empty by default.")
+        .insert("stride_As", "", "Tensor A strides - it is empty by default.")
+        .insert("stride_Bs", "", "Tensor B strides - it is empty by default.")
+        .insert("stride_Cs", "", "Tensor C strides - it is empty by default.")
+        .insert("stride_AQs", "", "Tensor AQ strides - it is empty by default.")
+        .insert("stride_BQs", "", "Tensor BQ strides - it is empty by default.")
+        .insert("a_layout", "R", "A tensor data layout - Row by default.")
+        .insert("b_layout", "C", "B tensor data layout - Row by default.")
+        .insert("c_layout", "R", "C tensor data layout - Row by default.")
+        .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
+        .insert("prec", "fp8", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "10", "number of iterations before benchmark the kernel.")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel.")
+        .insert("group_count", "8", "group count.")
+        .insert("kbatch", "1", "kbatch for SplitK");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+inline std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
+{
+    return gemm_descs.size() * sizeof(ck_tile::QuantGemmTransKernelArg);
+}
+
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr,
+                            bool splitk = false);
diff --git a/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc
new file mode 100644
index 0000000000..10d317a2c7
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename BQDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename AQLayout,
+          typename BLayout,
+          typename BQLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm(int n_warmup,
+                  int n_repeat,
+                  int group_count,
+                  const std::vector<grouped_gemm_kargs>& args)
+{
+    // Workspace memory allocated to hold the gemm descriptions.
+    ck_tile::DeviceMem gemm_workspace;
+    gemm_workspace.Realloc(get_workspace_size(args));
+
+    float ave_time = 0;
+
+    // NOTE: With the persistent TileLoop kernel, we do not necessarily need to have
+    // the gemm problems known on the host. Instead, we can just pass the pointer
+    // to the kernel and let the workgroups figure out which tiles to work on.
+    // This is useful when the gemm problems are generated dynamically.
+    // In this example however, we generate the `kargs` using the known gemm_descs,
+    // and copy the gemm descriptions to the device memory.
+    // The contents of the memory pointed to by `kargs_ptr` pointer could be
+    // written by e.g. another kernel from earlier stage.
+    std::vector<ck_tile::QuantGemmTransKernelArg> kargs;
+    void* kargs_ptr = gemm_workspace.GetDeviceBuffer();
+    assert(args[0].k_batch == 1);
+    for(const auto& arg : args)
+    {
+        kargs.emplace_back(ck_tile::QuantGroupedGemmKernelArgs{arg.a_ptr,
+                                                               arg.b_ptr,
+                                                               arg.aq_ptr,
+                                                               arg.bq_ptr,
+                                                               arg.e_ptr,
+                                                               arg.M,
+                                                               arg.N,
+                                                               arg.K,
+                                                               arg.QK_A,
+                                                               arg.QK_B,
+                                                               arg.stride_A,
+                                                               arg.stride_B,
+                                                               arg.stride_E,
+                                                               arg.stride_AQ,
+                                                               arg.stride_BQ,
+                                                               arg.k_batch});
+    }
+    const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat};
+    HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
+                                        kargs.data(),
+                                        kargs.size() * sizeof(ck_tile::QuantGemmTransKernelArg),
+                                        hipMemcpyHostToDevice,
+                                        stream.stream_id_));
+    ave_time = grouped_gemm_tileloop<GemmConfig,
+                                     ALayout,
+                                     AQLayout,
+                                     BLayout,
+                                     BQLayout,
+                                     CLayout,
+                                     ADataType,
+                                     AQDataType,
+                                     BDataType,
+                                     BQDataType,
+                                     AccDataType,
+                                     CDataType>(stream, group_count, kargs_ptr);
+
+    std::string op_name{"Grouped Gemm"};
+
+    std::size_t flop = 0, num_btype = 0;
+    for(int j = 0; j < group_count; ++j)
+    {
+        flop += std::size_t(2) * args[j].M * args[j].N * args[j].K;
+
+        num_btype += sizeof(ADataType) * args[j].M * args[j].K +
+                     sizeof(BDataType) * args[j].K * args[j].N +
+                     sizeof(CDataType) * args[j].M * args[j].N;
+    }
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+              << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename BQDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename ALayout,
+          typename AQLayout,
+          typename BLayout,
+          typename BQLayout,
+          typename CLayout>
+int run_grouped_gemm_example_with_layouts(int argc,
+                                          char* argv[],
+                                          const ALayout a_layout                  = ALayout{},
+                                          const AQLayout aq_layout                = AQLayout{},
+                                          const BLayout b_layout                  = BLayout{},
+                                          const BQLayout bq_layout                = BQLayout{},
+                                          [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+
+    if(!result)
+    {
+        return -1;
+    };
+
+    auto valid_input_data = [&](int group_count, const auto&... args) {
+        return !(args.empty() || ...) && group_count == (args.size() == ...);
+    };
+
+    const int group_count = arg_parser.get_int("group_count");
+    const int repeat      = arg_parser.get_int("repeat");
+    const int warmup      = arg_parser.get_int("warmup");
+    const int kbatch      = arg_parser.get_int("kbatch");
+    bool validate         = arg_parser.get_bool("validate");
+
+    if(kbatch > 1 && validate && warmup + repeat > 1)
+    {
+        std::cout << "WARNING: Data validation enabled with SplitK and more than"
+                  << "1 warmup/repeat. Disabling validation." << std::endl;
+        validate = false;
+    }
+
+    std::vector<ck_tile::index_t> Ms         = arg_parser.get_int_vec("Ms");
+    std::vector<ck_tile::index_t> Ns         = arg_parser.get_int_vec("Ns");
+    std::vector<ck_tile::index_t> Ks         = arg_parser.get_int_vec("Ks");
+    std::vector<ck_tile::index_t> stride_As  = arg_parser.get_int_vec("stride_As");
+    std::vector<ck_tile::index_t> stride_Bs  = arg_parser.get_int_vec("stride_Bs");
+    std::vector<ck_tile::index_t> stride_Cs  = arg_parser.get_int_vec("stride_Cs");
+    std::vector<ck_tile::index_t> stride_AQs = arg_parser.get_int_vec("stride_AQs");
+    std::vector<ck_tile::index_t> stride_BQs = arg_parser.get_int_vec("stride_BQs");
+
+    ck_tile::index_t AQK, BQK;
+
+    if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs))
+    {
+        std::cout << "Please check the input data. Default values will be used." << std::endl;
+
+        // Clear existing (invalid) data before adding defaults
+        Ms.clear();
+        Ns.clear();
+        Ks.clear();
+        stride_As.clear();
+        stride_Bs.clear();
+        stride_Cs.clear();
+        stride_AQs.clear();
+        stride_BQs.clear();
+
+        for(int i = 0; i < group_count; i++)
+        {
+            Ms.push_back(256 + 256 * i);
+            Ns.push_back(256 + 512 * i);
+            Ks.push_back(512 + 128 * i);
+
+            // Let get_default_stride calculate based on layout
+            stride_As.push_back(0);
+            stride_Bs.push_back(0);
+            stride_Cs.push_back(0);
+            stride_AQs.push_back(0);
+            stride_BQs.push_back(0);
+        }
+    }
+
+    std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+    std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+    std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
+    std::vector<ck_tile::HostTensor<AQDataType>> aq_tensors;
+    std::vector<ck_tile::HostTensor<BQDataType>> bq_tensors;
+
+    a_m_k_tensors.reserve(group_count);
+    b_k_n_tensors.reserve(group_count);
+    c_m_n_tensors.reserve(group_count);
+    aq_tensors.reserve(group_count);
+    bq_tensors.reserve(group_count);
+
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> aq_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> bq_dev_buf;
+
+    a_m_k_dev_buf.reserve(group_count);
+    b_k_n_dev_buf.reserve(group_count);
+    c_m_n_dev_buf.reserve(group_count);
+    aq_dev_buf.reserve(group_count);
+    bq_dev_buf.reserve(group_count);
+
+    std::vector<grouped_gemm_kargs> gemm_descs;
+    gemm_descs.reserve(group_count);
+
+    for(int i = 0; i < group_count; ++i)
+    {
+
+        const ck_tile::index_t M = Ms[i];
+        const ck_tile::index_t N = Ns[i];
+        const ck_tile::index_t K = Ks[i];
+
+        AQK = 1; // Row quantization: tensor shape [M, 1]. Only for NT
+        BQK = N; // Column quantization: tensor shape [1, N]. Only for NT
+
+        stride_As[i]  = ck_tile::get_default_stride(M, K, stride_As[i], is_row_major(a_layout));
+        stride_Bs[i]  = ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
+        stride_Cs[i]  = ck_tile::get_default_stride(M, N, stride_Cs[i], is_row_major(CLayout{}));
+        stride_AQs[i] = ck_tile::get_default_stride(M, AQK, stride_AQs[i], is_row_major(aq_layout));
+        stride_BQs[i] = ck_tile::get_default_stride(1, N, stride_BQs[i], is_row_major(bq_layout));
+        a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
+            ck_tile::host_tensor_descriptor(M, K, stride_As[i], is_row_major(a_layout))));
+        b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
+            ck_tile::host_tensor_descriptor(K, N, stride_Bs[i], is_row_major(b_layout))));
+        c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
+            ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], is_row_major(CLayout{}))));
+        aq_tensors.push_back(ck_tile::HostTensor<AQDataType>(
+            ck_tile::host_tensor_descriptor(M, AQK, stride_AQs[i], is_row_major(aq_layout))));
+        bq_tensors.push_back(ck_tile::HostTensor<BQDataType>(
+            ck_tile::host_tensor_descriptor(1, N, stride_BQs[i], is_row_major(bq_layout))));
+
+        std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc
+                  << " b_k_n: " << b_k_n_tensors[i].mDesc << " c_m_n: " << c_m_n_tensors[i].mDesc
+                  << " aq: " << aq_tensors[i].mDesc << " bq: " << bq_tensors[i].mDesc << std::endl;
+
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
+        ck_tile::FillUniformDistribution<AQDataType>{-1.f, 1.f}(aq_tensors[i]);
+        ck_tile::FillUniformDistribution<BQDataType>{-1.f, 1.f}(bq_tensors[i]);
+
+        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            a_m_k_tensors[i].get_element_space_size_in_bytes()));
+        b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            b_k_n_tensors[i].get_element_space_size_in_bytes()));
+        c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            c_m_n_tensors[i].get_element_space_size_in_bytes()));
+        aq_dev_buf.push_back(
+            std::make_unique<ck_tile::DeviceMem>(aq_tensors[i].get_element_space_size_in_bytes()));
+        bq_dev_buf.push_back(
+            std::make_unique<ck_tile::DeviceMem>(bq_tensors[i].get_element_space_size_in_bytes()));
+
+        a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
+        b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
+        aq_dev_buf[i]->ToDevice(aq_tensors[i].data());
+        bq_dev_buf[i]->ToDevice(bq_tensors[i].data());
+        c_m_n_dev_buf[i]->SetZero();
+        c_m_n_tensors[i].SetZero();
+
+        const void* p_a  = a_m_k_dev_buf[i]->GetDeviceBuffer();
+        const void* p_b  = b_k_n_dev_buf[i]->GetDeviceBuffer();
+        void* p_c        = c_m_n_dev_buf[i]->GetDeviceBuffer();
+        const void* p_aq = aq_dev_buf[i]->GetDeviceBuffer();
+        const void* p_bq = bq_dev_buf[i]->GetDeviceBuffer();
+
+        gemm_descs.push_back({p_a,
+                              p_b,
+                              p_c,
+                              p_aq,
+                              p_bq,
+                              kbatch,
+                              M,
+                              N,
+                              K,
+                              AQK,
+                              BQK,
+                              stride_As[i],
+                              stride_Bs[i],
+                              stride_Cs[i],
+                              stride_AQs[i],
+                              stride_BQs[i]});
+    }
+
+    invoke_gemm<GemmConfig,
+                ADataType,
+                AQDataType,
+                BDataType,
+                BQDataType,
+                AccDataType,
+                CDataType,
+                ALayout,
+                AQLayout,
+                BLayout,
+                BQLayout,
+                CLayout>(warmup, repeat, group_count, gemm_descs);
+
+    for(int i = 0; i < group_count; i++)
+    {
+        c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data());
+    }
+
+    bool pass{true};
+    if(validate)
+    {
+        for(int i = 0; i < group_count; ++i)
+        {
+            ck_tile::HostTensor<CDataType> c_m_n_host_ref(ck_tile::host_tensor_descriptor(
+                Ms[i], Ns[i], stride_Cs[i], is_row_major(CLayout{})));
+            c_m_n_host_ref.SetZero();
+            ck_tile::reference_gemm_rowcol_quant<ADataType,
+                                                 AQDataType,
+                                                 BDataType,
+                                                 BQDataType,
+                                                 AccDataType,
+                                                 CDataType>(
+                a_m_k_tensors[i], aq_tensors[i], b_k_n_tensors[i], bq_tensors[i], c_m_n_host_ref);
+            const float max_accumulated_value =
+                *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+            const auto rtol_atol =
+                calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                    Ks[i], kbatch, max_accumulated_value);
+            pass &= ck_tile::check_err(c_m_n_tensors[i],
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+            std::cout << "gemm[" << i
+                      << "] Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+template <typename GemmConfig, typename PrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Types = GemmTypeConfig<PrecType>;
+    // Specific type aliases for easy access
+    using ADataType   = typename Types::ADataType;
+    using BDataType   = typename Types::BDataType;
+    using AccDataType = typename Types::AccDataType;
+    using CDataType   = typename Types::CDataType;
+    using AQDataType  = typename Types::AccDataType;
+    using BQDataType  = typename Types::AccDataType;
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     AQDataType,
+                                                     BDataType,
+                                                     BQDataType,
+                                                     CDataType,
+                                                     AccDataType>(
+            argc, argv, Row{}, Row{}, Col{}, Col{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     AQDataType,
+                                                     BDataType,
+                                                     BQDataType,
+                                                     CDataType,
+                                                     AccDataType>(
+            argc, argv, Row{}, Row{}, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     AQDataType,
+                                                     BDataType,
+                                                     BQDataType,
+                                                     CDataType,
+                                                     AccDataType>(
+            argc, argv, Row{}, Row{}, Col{}, Col{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<GemmConfig,
+                                                     ADataType,
+                                                     AQDataType,
+                                                     BDataType,
+                                                     BQDataType,
+                                                     CDataType,
+                                                     AccDataType>(
+            argc, argv, Col{}, Col{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
+
+template <template <typename PrecType> typename GemmConfig>
+int run_grouped_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type configuration.");
+    }
+}
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
index 425299203f..f822c7d8a7 100644
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -2,7 +2,6 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
 {
@@ -41,7 +40,6 @@ template <typename GemmConfig,
           typename BLayout,
           typename DsLayout,
           typename CLayout,
-          bool Persistent,
           typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float invoke_gemm(int n_warmup,
                   int n_repeat,
@@ -53,10 +51,10 @@ float invoke_gemm(int n_warmup,
     gemm_workspace.Realloc(get_workspace_size(args));
 
     float ave_time = 0;
-    if constexpr(!Persistent)
+    if constexpr(!GemmConfig::Persistent)
     {
-        // Regular version of grouped gemm
-        ave_time = grouped_gemm<ADataType,
+        ave_time = grouped_gemm<GemmConfig,
+                                ADataType,
                                 BDataType,
                                 DsDataType,
                                 AccDataType,
@@ -72,15 +70,25 @@ float invoke_gemm(int n_warmup,
     }
     else
     {
-        // NOTE: With the persistent TileLoop kernel, we do not necessarily need to have
-        // the gemm problems known on the host. Instead, we can just pass the pointer
-        // to the kernel and let the workgroups figure out which tiles to work on.
-        // This is useful when the gemm problems are generated dynamically.
+        if(GemmConfig::Preshuffle)
+        {
+            // not supported yet
+            throw std::runtime_error(
+                "Persistent grouped gemm with preshuffle is not supported yet");
+        }
+
+        // NOTE: With the persistent TileLoop kernel, we do not necessarily need to haveCollapse
+        // commentComment on line L74tenpercent commented on Sep 5, 2025 tenpercenton Sep 5,
+        // 2025ContributorMore actionsdid you intend to remove the comment?Write a replyResolve
+        // commentCode has comments. Press enter to view. the gemm problems known on the host.
+        // Instead, we can just pass the pointer to the kernel and let the workgroups figure out
+        // which tiles to work on. This is useful when the gemm problems are generated dynamically.
         // In this example however, we generate the `kargs` using the known gemm_descs,
         // and copy the gemm descriptions to the device memory.
         // The contents of the memory pointed to by `kargs_ptr` pointer could be
         // written by e.g. another kernel from earlier stage.
-        std::vector<ck_tile::GemmTransKernelArg> kargs;
+
+        std::vector<ck_tile::GemmTransKernelArg<>> kargs;
         void* kargs_ptr   = gemm_workspace.GetDeviceBuffer();
         const bool splitk = args[0].k_batch > 1;
         for(const auto& arg : args)
@@ -101,7 +109,7 @@ float invoke_gemm(int n_warmup,
         const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat};
         HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
                                             kargs.data(),
-                                            kargs.size() * sizeof(ck_tile::GemmTransKernelArg),
+                                            kargs.size() * sizeof(ck_tile::GemmTransKernelArg<>),
                                             hipMemcpyHostToDevice,
                                             stream.stream_id_));
         ave_time = grouped_gemm_tileloop<GemmConfig,
@@ -114,29 +122,10 @@ float invoke_gemm(int n_warmup,
                                          CDataType>(stream, group_count, kargs_ptr, splitk);
     }
 
-    std::string op_name{"Grouped Gemm"};
-
-    std::size_t flop = 0, num_btype = 0;
-    for(int j = 0; j < group_count; ++j)
-    {
-        flop += std::size_t(2) * args[j].M * args[j].N * args[j].K;
-
-        num_btype += sizeof(ADataType) * args[j].M * args[j].K +
-                     sizeof(BDataType) * args[j].K * args[j].N +
-                     sizeof(CDataType) * args[j].M * args[j].N;
-    }
-
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-              << gb_per_sec << " GB/s, " << op_name << std::endl;
-
     return ave_time;
 }
 
-template <bool Persistent,
-          typename GemmConfig,
+template <typename GemmConfig,
           typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -150,12 +139,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
                                           const BLayout b_layout                  = BLayout{},
                                           [[maybe_unused]] const CLayout c_layout = CLayout{})
 {
-    auto [result, arg_parser] = create_args(argc, argv);
 
-    if(!result)
-    {
-        return -1;
-    };
+    auto [result, arg_parser] = create_args(argc, argv);
 
     auto valid_input_data = [&](int group_count, const auto&... args) {
         return !(args.empty() || ...) && group_count == (args.size() == ...);
@@ -184,15 +169,28 @@ int run_grouped_gemm_example_with_layouts(int argc,
     if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs))
     {
         std::cout << "Please check the input data. Default values will be used." << std::endl;
+        std::cout << "Default values: Ms (256, 512, 768, 1024..), Ns (256, 768, 1280..), Ks (512, "
+                     "896, 1280..)"
+                  << std::endl;
+
+        // Clear existing (invalid) data before adding defaults
+        Ms.clear();
+        Ns.clear();
+        Ks.clear();
+        stride_As.clear();
+        stride_Bs.clear();
+        stride_Cs.clear();
+
         for(int i = 0; i < group_count; i++)
         {
             Ms.push_back(256 + 256 * i);
             Ns.push_back(256 + 512 * i);
-            Ks.push_back(512 + 128 * i);
+            Ks.push_back(512 + 384 * i);
 
-            stride_As.push_back(Ks[i]);
-            stride_Bs.push_back(Ks[i]);
-            stride_Cs.push_back(Ns[i]);
+            // Set default strides based on layout later using get_default_stride
+            stride_As.push_back(0);
+            stride_Bs.push_back(0);
+            stride_Cs.push_back(0);
         }
     }
 
@@ -217,11 +215,12 @@ int run_grouped_gemm_example_with_layouts(int argc,
 
     for(int i = 0; i < group_count; ++i)
     {
+
         const ck_tile::index_t M = Ms[i];
         const ck_tile::index_t N = Ns[i];
         const ck_tile::index_t K = Ks[i];
 
-        stride_As[i] = ck_tile::get_default_stride(M, N, stride_As[i], is_row_major(a_layout));
+        stride_As[i] = ck_tile::get_default_stride(M, K, stride_As[i], is_row_major(a_layout));
         stride_Bs[i] = ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
         stride_Cs[i] = ck_tile::get_default_stride(M, N, stride_Cs[i], is_row_major(CLayout{}));
 
@@ -239,15 +238,21 @@ int run_grouped_gemm_example_with_layouts(int argc,
         ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
         ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
 
-        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
-            a_m_k_tensors[i].get_element_space_size_in_bytes()));
-        b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
-            b_k_n_tensors[i].get_element_space_size_in_bytes()));
-        c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
-            c_m_n_tensors[i].get_element_space_size_in_bytes()));
+        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(a_m_k_tensors[i]));
+
+        // Perform preshuffle for B tensor
+        if constexpr(GemmConfig::Preshuffle)
+        {
+            ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n_tensors[i]);
+            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(b_shuffle_host));
+        }
+        else
+        {
+            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(b_k_n_tensors[i]));
+        }
+
+        c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(c_m_n_tensors[i]));
 
-        a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
-        b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
         c_m_n_dev_buf[i]->SetZero();
         c_m_n_tensors[i].SetZero();
 
@@ -255,21 +260,48 @@ int run_grouped_gemm_example_with_layouts(int argc,
         const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
         void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
 
-        gemm_descs.push_back(
-            {p_a, p_b, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+        gemm_descs.push_back({p_a,
+                              p_b,
+                              {/*ds_ptr*/},
+                              p_c,
+                              kbatch,
+                              M,
+                              N,
+                              K,
+                              stride_As[i],
+                              stride_Bs[i],
+                              {/*stride_Ds*/},
+                              stride_Cs[i]});
     }
 
-    invoke_gemm<GemmConfig,
-                ADataType,
-                BDataType,
-                ck_tile::tuple<>,
-                AccDataType,
-                CDataType,
-                ALayout,
-                BLayout,
-                ck_tile::tuple<>,
-                CLayout,
-                Persistent>(warmup, repeat, group_count, gemm_descs);
+    float ave_time = invoke_gemm<GemmConfig,
+                                 ADataType,
+                                 BDataType,
+                                 ck_tile::tuple<>,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 ck_tile::tuple<>,
+                                 CLayout>(warmup, repeat, group_count, gemm_descs);
+
+    std::string op_name{"Grouped Gemm"};
+
+    std::size_t flop = 0, num_btype = 0;
+    for(int j = 0; j < group_count; ++j)
+    {
+        flop += std::size_t(2) * gemm_descs[j].M * gemm_descs[j].N * gemm_descs[j].K;
+
+        num_btype += sizeof(ADataType) * gemm_descs[j].M * gemm_descs[j].K +
+                     sizeof(BDataType) * gemm_descs[j].K * gemm_descs[j].N +
+                     sizeof(CDataType) * gemm_descs[j].M * gemm_descs[j].N;
+    }
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+              << gb_per_sec << " GB/s, " << op_name << std::endl;
 
     for(int i = 0; i < group_count; i++)
     {
@@ -291,11 +323,12 @@ int run_grouped_gemm_example_with_layouts(int argc,
             const auto rtol_atol =
                 calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
                     Ks[i], kbatch, max_accumulated_value);
-            pass &= ck_tile::check_err(c_m_n_tensors[i],
-                                       c_m_n_host_ref,
-                                       "Error: Incorrect results!",
-                                       rtol_atol.at(ck_tile::number<0>{}),
-                                       rtol_atol.at(ck_tile::number<1>{}));
+            pass &=
+                ck_tile::check_err(c_m_n_tensors[i],
+                                   c_m_n_host_ref,
+                                   "Error: Incorrect results! in group [" + std::to_string(i) + "]",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
             std::cout << "gemm[" << i
                       << "] Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
                       << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
@@ -304,88 +337,16 @@ int run_grouped_gemm_example_with_layouts(int argc,
         std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_grouped_gemm_json_results<ALayout, BLayout, CLayout>(arg_parser.get_str("jsonfile"),
+                                                                  op_name,
+                                                                  group_count,
+                                                                  pass,
+                                                                  ave_time,
+                                                                  tflops,
+                                                                  gb_per_sec);
+    }
+
     return pass;
 }
-
-template <bool Persistent, typename GemmConfig, typename PrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
-{
-    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
-    using Types = GemmTypeConfig<PrecType>;
-    // Specific type aliases for easy access
-    using ADataType   = typename Types::ADataType;
-    using BDataType   = typename Types::BDataType;
-    using AccDataType = typename Types::AccDataType;
-    using CDataType   = typename Types::CDataType;
-
-    if(a_layout == "R" && b_layout == "C")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent,
-                                                     GemmConfig,
-                                                     ADataType,
-                                                     BDataType,
-                                                     CDataType,
-                                                     AccDataType>(argc, argv, Row{}, Col{}, Row{});
-    }
-    else if(a_layout == "R" && b_layout == "R")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent,
-                                                     GemmConfig,
-                                                     ADataType,
-                                                     BDataType,
-                                                     CDataType,
-                                                     AccDataType>(argc, argv, Row{}, Row{}, Row{});
-    }
-    else if(a_layout == "C" && b_layout == "R")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent,
-                                                     GemmConfig,
-                                                     ADataType,
-                                                     BDataType,
-                                                     CDataType,
-                                                     AccDataType>(argc, argv, Col{}, Row{}, Row{});
-    }
-    else if(a_layout == "C" && b_layout == "C")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent,
-                                                     GemmConfig,
-                                                     ADataType,
-                                                     BDataType,
-                                                     CDataType,
-                                                     AccDataType>(argc, argv, Col{}, Col{}, Row{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
-    }
-}
-
-template <bool Persistent, template <typename PrecType> typename GemmConfig>
-int run_grouped_gemm_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-    {
-        return -1;
-    }
-
-    const std::string a_layout  = arg_parser.get_str("a_layout");
-    const std::string b_layout  = arg_parser.get_str("b_layout");
-    const std::string data_type = arg_parser.get_str("prec");
-
-    if(data_type == "fp16")
-    {
-        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "fp8")
-    {
-        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
-            a_layout, b_layout, argc, argv);
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type configuration.");
-    }
-}
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_multi_d_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_multi_d_example.inc
new file mode 100644
index 0000000000..8f275b069b
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_multi_d_example.inc
@@ -0,0 +1,417 @@
+#pragma once
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const D0& d0, const D1& d1) const -> void
+    {
+        const float x0_f = ck_tile::type_convert<float>(c) * ck_tile::type_convert<float>(d0) *
+                           ck_tile::type_convert<float>(d1);
+
+        e = ck_tile::type_convert<E>(x0_f);
+    }
+};
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeTypeAB =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+
+    using ComputeType =
+        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+        max_accumulated_value, kbatch);
+
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise>
+float invoke_gemm(int n_warmup,
+                  int n_repeat,
+                  int group_count,
+                  const std::vector<grouped_gemm_multi_d_kargs>& args)
+{
+    // Workspace memory allocated to hold the gemm descriptions.
+    ck_tile::DeviceMem gemm_workspace;
+    gemm_workspace.Realloc(get_workspace_size(args));
+
+    float ave_time = 0;
+    if constexpr(!GemmConfig::Persistent)
+    {
+        ave_time = grouped_gemm_multi_d<GemmConfig,
+                                        ADataType,
+                                        BDataType,
+                                        DsDataType,
+                                        AccDataType,
+                                        EDataType,
+                                        ALayout,
+                                        BLayout,
+                                        DsLayout,
+                                        ELayout,
+                                        CDEElementWise>(
+            args,
+            ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat},
+            gemm_workspace.GetDeviceBuffer());
+    }
+    else
+    {
+        std::vector<ck_tile::GemmTransKernelArg<DsDataType::size()>> kargs;
+        void* kargs_ptr   = gemm_workspace.GetDeviceBuffer();
+        const bool splitk = args[0].k_batch > 1;
+        for(const auto& arg : args)
+        {
+            kargs.emplace_back(ck_tile::UniversalGemmKernelArgs<1, 1, 2>{{arg.a_ptr},
+                                                                         {arg.b_ptr},
+                                                                         arg.ds_ptr,
+                                                                         arg.e_ptr,
+                                                                         arg.M,
+                                                                         arg.N,
+                                                                         arg.K,
+                                                                         {arg.stride_A},
+                                                                         {arg.stride_B},
+                                                                         arg.stride_Ds,
+                                                                         arg.stride_E,
+                                                                         arg.k_batch});
+        }
+        const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat};
+        HIP_CHECK_ERROR(hipMemcpyWithStream(
+            kargs_ptr,
+            kargs.data(),
+            kargs.size() * sizeof(ck_tile::GemmTransKernelArg<DsDataType::size()>),
+            hipMemcpyHostToDevice,
+            stream.stream_id_));
+        ave_time =
+            grouped_gemm_multi_d_tileloop<GemmConfig,
+                                          ADataType,
+                                          BDataType,
+                                          DsDataType,
+                                          AccDataType,
+                                          EDataType,
+                                          ALayout,
+                                          BLayout,
+                                          DsLayout,
+                                          ELayout,
+                                          CDEElementWise>(stream, group_count, kargs_ptr, splitk);
+    }
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout>
+int run_grouped_gemm_multi_d_example_with_layouts(int argc,
+                                                  char* argv[],
+                                                  const ALayout a_layout   = ALayout{},
+                                                  const BLayout b_layout   = BLayout{},
+                                                  const D0Layout d0_layout = D0Layout{},
+                                                  const D1Layout d1_layout = D1Layout{},
+                                                  const ELayout e_layout   = ELayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+
+    using CDElementWise = MultiplyMultiply;
+    using DsLayout      = ck_tile::tuple<D0Layout, D1Layout>;
+
+    auto valid_input_data = [&](int group_count, const auto&... args) {
+        return !(args.empty() || ...) && group_count == (args.size() == ...);
+    };
+
+    const int group_count = arg_parser.get_int("group_count");
+    const int repeat      = arg_parser.get_int("repeat");
+    const int warmup      = arg_parser.get_int("warmup");
+    const int kbatch      = arg_parser.get_int("kbatch");
+    bool validate         = arg_parser.get_bool("validate");
+
+    if(kbatch > 1 && validate && warmup + repeat > 1)
+    {
+        std::cout << "WARNING: Data validation enabled with SplitK and more than"
+                  << "1 warmup/repeat. Disabling validation." << std::endl;
+        validate = false;
+    }
+
+    std::vector<ck_tile::index_t> Ms        = arg_parser.get_int_vec("Ms");
+    std::vector<ck_tile::index_t> Ns        = arg_parser.get_int_vec("Ns");
+    std::vector<ck_tile::index_t> Ks        = arg_parser.get_int_vec("Ks");
+    std::vector<ck_tile::index_t> stride_As = arg_parser.get_int_vec("stride_As");
+    std::vector<ck_tile::index_t> stride_Bs = arg_parser.get_int_vec("stride_Bs");
+    std::vector<ck_tile::index_t> stride_D0 = arg_parser.get_int_vec("stride_Ds");
+    std::vector<ck_tile::index_t> stride_D1 = arg_parser.get_int_vec("stride_Ds");
+    std::vector<ck_tile::index_t> stride_Es = arg_parser.get_int_vec("stride_Es");
+
+    if(!valid_input_data(
+           group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_D0, stride_D1, stride_Es))
+    {
+        std::cout << "Please check the input data. Default values will be used." << std::endl;
+        std::cout << "Default values: Ms (256, 512, 768, 1024..), Ns (256, 768, 1280..), Ks (512, "
+                     "896, 1280..), stride_As (Ks), stride_Bs (Ks), stride_D0 (Ns), stride_D1 "
+                     "(Ns), stride_Es (Ns)"
+                  << std::endl;
+        for(int i = 0; i < group_count; i++)
+        {
+            Ms.push_back(256 /* + 256 * i */);
+            Ns.push_back(256 /* + 512 * i */);
+            Ks.push_back(64 /* + 384 * i */);
+
+            stride_As.push_back(Ks[i]);
+            stride_Bs.push_back(Ks[i]);
+            stride_D0.push_back(Ns[i]);
+            stride_D1.push_back(Ns[i]);
+            stride_Es.push_back(Ns[i]);
+        }
+    }
+
+    std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+    std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+    std::vector<ck_tile::HostTensor<D0DataType>> d0_m_n_tensors;
+    std::vector<ck_tile::HostTensor<D1DataType>> d1_m_n_tensors;
+    std::vector<ck_tile::HostTensor<EDataType>> e_m_n_tensors;
+
+    a_m_k_tensors.reserve(group_count);
+    b_k_n_tensors.reserve(group_count);
+    d0_m_n_tensors.reserve(group_count);
+    d1_m_n_tensors.reserve(group_count);
+    e_m_n_tensors.reserve(group_count);
+
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> d0_m_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> d1_m_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> e_m_n_dev_buf;
+
+    a_m_k_dev_buf.reserve(group_count);
+    b_k_n_dev_buf.reserve(group_count);
+    d0_m_n_dev_buf.reserve(group_count);
+    d1_m_n_dev_buf.reserve(group_count);
+    e_m_n_dev_buf.reserve(group_count);
+
+    std::vector<grouped_gemm_multi_d_kargs> gemm_descs;
+    gemm_descs.reserve(group_count);
+
+    for(int i = 0; i < group_count; ++i)
+    {
+
+        const ck_tile::index_t M = Ms[i];
+        const ck_tile::index_t N = Ns[i];
+        const ck_tile::index_t K = Ks[i];
+
+        stride_As[i] = ck_tile::get_default_stride(M, K, stride_As[i], is_row_major(a_layout));
+        stride_Bs[i] = ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
+
+        stride_D0[i] = ck_tile::get_default_stride(M, N, stride_D0[i], is_row_major(d0_layout));
+        stride_D1[i] = ck_tile::get_default_stride(M, N, stride_D1[i], is_row_major(d1_layout));
+
+        stride_Es[i] = ck_tile::get_default_stride(M, N, stride_Es[i], is_row_major(e_layout));
+
+        a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
+            ck_tile::host_tensor_descriptor(M, K, stride_As[i], is_row_major(a_layout))));
+        b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
+            ck_tile::host_tensor_descriptor(K, N, stride_Bs[i], is_row_major(b_layout))));
+
+        d0_m_n_tensors.push_back(ck_tile::HostTensor<D0DataType>(
+            ck_tile::host_tensor_descriptor(M, N, stride_D0[i], is_row_major(d0_layout))));
+        d1_m_n_tensors.push_back(ck_tile::HostTensor<D1DataType>(
+            ck_tile::host_tensor_descriptor(M, N, stride_D1[i], is_row_major(d1_layout))));
+
+        e_m_n_tensors.push_back(ck_tile::HostTensor<EDataType>(
+            ck_tile::host_tensor_descriptor(M, N, stride_Es[i], is_row_major(e_layout))));
+
+        std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc
+                  << " b_k_n: " << b_k_n_tensors[i].mDesc << " d0_m_n: " << d0_m_n_tensors[i].mDesc
+                  << " d1_m_n: " << d1_m_n_tensors[i].mDesc << " e_m_n: " << e_m_n_tensors[i].mDesc
+                  << std::endl;
+
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
+        ck_tile::FillUniformDistribution<D0DataType>{2.f, -2.f}(d0_m_n_tensors[i]);
+        ck_tile::FillUniformDistribution<D1DataType>{2.f, -2.f}(d1_m_n_tensors[i]);
+
+        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(a_m_k_tensors[i]));
+
+        b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(b_k_n_tensors[i]));
+
+        d0_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(d0_m_n_tensors[i]));
+        d1_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(d1_m_n_tensors[i]));
+        e_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(e_m_n_tensors[i]));
+
+        e_m_n_dev_buf[i]->SetZero();
+
+        const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
+        const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
+        void* p_e       = e_m_n_dev_buf[i]->GetDeviceBuffer();
+
+        std::array<const void*, DsDataType::size()> ds_ptr_buf = {
+            d0_m_n_dev_buf[i]->GetDeviceBuffer(), d1_m_n_dev_buf[i]->GetDeviceBuffer()};
+        std::array<ck_tile::index_t, DsDataType::size()> stridesDs = {stride_D0[i], stride_D1[i]};
+
+        gemm_descs.push_back({p_a,
+                              p_b,
+                              ds_ptr_buf,
+                              p_e,
+                              kbatch,
+                              M,
+                              N,
+                              K,
+                              stride_As[i],
+                              stride_Bs[i],
+                              stridesDs,
+                              stride_Es[i]});
+    }
+
+    float ave_time = invoke_gemm<GemmConfig,
+                                 ADataType,
+                                 BDataType,
+                                 DsDataType,
+                                 AccDataType,
+                                 EDataType,
+                                 ALayout,
+                                 BLayout,
+                                 DsLayout,
+                                 ELayout,
+                                 CDElementWise>(warmup, repeat, group_count, gemm_descs);
+
+    std::string op_name{"Grouped Gemm Multiple-D"};
+
+    std::size_t flop = 0, num_btype = 0;
+    for(int j = 0; j < group_count; ++j)
+    {
+        flop += std::size_t(2) * gemm_descs[j].M * gemm_descs[j].N * gemm_descs[j].K;
+        ck_tile::static_for<0, DsDataType::size(), 1>{}([&](auto i) {
+            num_btype += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
+                         gemm_descs[j].M * gemm_descs[j].N;
+            flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
+                    gemm_descs[j].M * gemm_descs[j].N;
+        });
+
+        num_btype += sizeof(ADataType) * gemm_descs[j].M * gemm_descs[j].K +
+                     sizeof(BDataType) * gemm_descs[j].K * gemm_descs[j].N +
+                     sizeof(EDataType) * gemm_descs[j].M * gemm_descs[j].N;
+    }
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+              << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+    std::vector<ck_tile::HostTensor<EDataType>> e_m_n_host_refs;
+    e_m_n_host_refs.reserve(group_count);
+
+    // copy e_m_n_tensors result from device to host and initialize host tensors to zero
+    for(int i = 0; i < group_count; i++)
+    {
+        e_m_n_dev_buf[i]->FromDevice(e_m_n_tensors[i].data());
+    }
+
+    bool pass{true};
+    if(validate)
+    {
+        for(int i = 0; i < group_count; ++i)
+        {
+            e_m_n_host_refs.push_back(ck_tile::HostTensor<EDataType>(
+                host_tensor_descriptor(Ms[i], Ns[i], stride_Es[i], is_row_major(e_layout))));
+
+            e_m_n_host_refs[i].SetZero();
+
+            ck_tile::reference_gemm_multiple_d<ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               AccDataType,
+                                               EDataType,
+                                               CDElementWise>(
+                a_m_k_tensors[i],
+                b_k_n_tensors[i],
+                {d0_m_n_tensors[i], d1_m_n_tensors[i]},
+                e_m_n_host_refs[i]);
+
+            const float max_accumulated_value =
+                *std::max_element(e_m_n_host_refs[i].mData.begin(), e_m_n_host_refs[i].mData.end());
+
+            const auto rtol_atol = calculate_rtol_atol(Ks[i], 1, max_accumulated_value);
+
+            pass &=
+                ck_tile::check_err(e_m_n_tensors[i],
+                                   e_m_n_host_refs[i],
+                                   "Error: Incorrect results! in group [" + std::to_string(i) + "]",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+        std::cout << "The CPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_grouped_gemm_json_results<ALayout, BLayout, ELayout>(arg_parser.get_str("jsonfile"),
+                                                                  op_name,
+                                                                  group_count,
+                                                                  pass,
+                                                                  ave_time,
+                                                                  tflops,
+                                                                  gb_per_sec);
+    }
+
+    return pass;
+}
+
+template <typename GemmConfig>
+int run_grouped_gemm_multi_d_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string ds_layout = arg_parser.get_str("ds_layout");
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if(a_layout == "R" && b_layout == "C" && ds_layout == "R")
+    {
+        return run_grouped_gemm_multi_d_example_with_layouts<GemmConfig>(
+            argc, argv, Row{}, Col{}, Row{}, Row{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for provided tensors!");
+    }
+}
diff --git a/example/ck_tile/18_flatmm/README.md b/example/ck_tile/18_flatmm/README.md
index eeaa7658bd..c58700fc7b 100644
--- a/example/ck_tile/18_flatmm/README.md
+++ b/example/ck_tile/18_flatmm/README.md
@@ -16,20 +16,23 @@ This will result in an executable `build/bin/tile_example_flatmm_basic`
 ## example
 ```
 args:
-          -b    batch size (default:1)
-          -m    m dimension (default:1024)
-          -n    n dimension (default:2048)
-          -k    k dimension (default:64)
-   -a_layout    Tensor A data layout (default: R)
-   -b_layout    Tensor B data layout (default: R)
-   -c_layout    Tensor C data layout (default: R)
+          -m    m dimension (default:256)
+          -n    n dimension (default:256)
+          -k    k dimension (default:128)
+   -a_layout    A tensor data layout - Row by default (default:R)
+   -b_layout    B tensor data layout - Row by default (default:C)
+   -c_layout    C tensor data layout - Row by default (default:R)
    -stride_a    Tensor A stride (default:0)
    -stride_b    Tensor B stride (default:0)
    -stride_c    Tensor C stride (default:0)
-          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
-          -e    Absolute error tolerance (default:1e-5)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:1)
        -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
-     -warmup    number of iterations before benchmark the kernel (default:10)
+     -warmup    number of iterations before benchmark the kernel (default:50)
      -repeat    number of iterations to benchmark the kernel (default:100)
       -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+    -split_k    splitK value (default:1)
+       -init    0:random, 1:linear, 2:constant(1) (default:0)
+  -warp_tile    0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only) (default:0)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:flatmm_basic.json)
 ```
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp
index 93117e5b75..280da8d333 100644
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -268,6 +268,9 @@ int main(int argc, char* argv[])
 
     try
     {
+#if defined(CK_TILE_USE_WMMA)
+        return !run_flatmm_example<FlatmmConfig16_Wmma>(argc, argv);
+#else
         int warp_tile = arg_parser.get_int("warp_tile");
         if(warp_tile == 0)
         {
@@ -285,6 +288,7 @@ int main(int argc, char* argv[])
         {
             return !run_flatmm_example<FlatmmConfig32_950>(argc, argv);
         }
+#endif
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.hpp b/example/ck_tile/18_flatmm/flatmm_basic.hpp
index 963a6ba675..8f8f65e214 100644
--- a/example/ck_tile/18_flatmm/flatmm_basic.hpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
@@ -86,6 +86,14 @@ struct FlatmmConfig16_950 : public FlatmmConfig16<DataType>
     static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 32 : 128;
 };
 
+template <typename DataType>
+struct FlatmmConfig16_Wmma : public FlatmmConfig16<DataType>
+{
+    static constexpr ck_tile::index_t M_Tile      = 64;
+    static constexpr ck_tile::index_t K_Tile      = 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
 template <typename ADataType>
 struct GemmBasicTypeConfig;
 
@@ -183,9 +191,12 @@ auto create_args(int argc, char* argv[])
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
         .insert("split_k", "1", "splitK value")
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
-        .insert("warp_tile",
-                "0",
-                "0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only)");
+#if !defined(CK_TILE_USE_WMMA)
+        .insert(
+            "warp_tile", "0", "0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only)")
+#endif
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "flatmm_basic.json", "json file name to dump results");
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc
index ff1a239cba..c187f72594 100644
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -2,7 +2,7 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <type_traits>
-
+#include "ck_tile/utility/json_dump.hpp"
 template <typename T>
 constexpr const char* DataTypeToString()
 {
@@ -43,15 +43,40 @@ auto shuffle_b(const ck_tile::HostTensor<T>& t)
     int n_ = t.get_lengths()[1];
     int k_ = t.get_lengths()[0];
 
-    int divisor = ck_tile::is_wave32() ? (FlatmmConfig::N_Warp_Tile == 32 ? 1 : 2)
-                                       : (FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4);
-    ck_tile::HostTensor<T> t_view({n_ / FlatmmConfig::N_Warp_Tile,
-                                   FlatmmConfig::N_Warp_Tile,
-                                   k_ / FlatmmConfig::K_Warp_Tile,
-                                   divisor,
-                                   FlatmmConfig::K_Warp_Tile / divisor});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = FlatmmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / FlatmmConfig::N_Warp_Tile,
+                                       FlatmmConfig::N_Warp_Tile,
+                                       k_ / FlatmmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / FlatmmConfig::N_Warp_Tile,
+                                       FlatmmConfig::N_Warp_Tile,
+                                       k_ / FlatmmConfig::K_Warp_Tile,
+                                       divisor,
+                                       FlatmmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
 }
 
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
@@ -140,17 +165,6 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
                                  CDEElementWise>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
 
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString<ADataType>()
-              << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A
-              << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time
-              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
     return ave_time;
 }
 
@@ -242,27 +256,38 @@ int run_flatmm_example_with_layouts(int argc,
     ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
     b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());
 
-    invoke_flatmm<FlatmmConfig,
-                  ADataType,
-                  BDataType,
-                  ck_tile::tuple<>,
-                  AccDataType,
-                  CDataType,
-                  ALayout,
-                  BLayout,
-                  ck_tile::tuple<>,
-                  CLayout>(a_dev_buf,
-                           b_shuffle_dev_buf,
-                           c_dev_buf,
-                           M,
-                           N,
-                           K,
-                           stride_A,
-                           stride_B,
-                           stride_C,
-                           kbatch,
-                           n_warmup,
-                           n_repeat);
+    float ave_time = invoke_flatmm<FlatmmConfig,
+                                   ADataType,
+                                   BDataType,
+                                   ck_tile::tuple<>,
+                                   AccDataType,
+                                   CDataType,
+                                   ALayout,
+                                   BLayout,
+                                   ck_tile::tuple<>,
+                                   CLayout>(a_dev_buf,
+                                            b_shuffle_dev_buf,
+                                            c_dev_buf,
+                                            M,
+                                            N,
+                                            K,
+                                            stride_A,
+                                            stride_B,
+                                            stride_C,
+                                            kbatch,
+                                            n_warmup,
+                                            n_repeat);
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString<ADataType>()
+              << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A
+              << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time
+              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
 
     c_dev_buf.FromDevice(c_rslt_host.data());
     bool pass = true;
@@ -350,5 +375,22 @@ int run_flatmm_example_with_layouts(int argc,
         std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_flatmm_json_results(arg_parser.get_str("jsonfile"),
+                                 DataTypeToString<ADataType>(),
+                                 M,
+                                 N,
+                                 K,
+                                 stride_A,
+                                 stride_B,
+                                 stride_C,
+                                 kbatch,
+                                 pass,
+                                 ave_time,
+                                 tflops,
+                                 gb_per_sec);
+    }
+
     return pass;
 }
diff --git a/example/ck_tile/19_gemm_multi_d/README.md b/example/ck_tile/19_gemm_multi_d/README.md
index 2cf2b1ea03..b9416f3112 100644
--- a/example/ck_tile/19_gemm_multi_d/README.md
+++ b/example/ck_tile/19_gemm_multi_d/README.md
@@ -17,19 +17,21 @@ This will result in an executable `build/bin/tile_example_gemm_multi_d_fp16`
 ## example
 ```
 args:
-       -m  M dimensions - (Default: 3840)
-       -n  N dimensions - (Default: 4096)
-       -k  K dimensions - (Default: 4096)
--a_layout  Tensor A layout (default:R)
--b_layout  Tensor B layout (default:C)
--ds_layout Tensor D layout (default:R)
--e_layout  Tensor E layout (default:R)
--stride_a  Tensor A strides - (Default: 0)
--stride_b  Tensor B strides - (Default: 0)
--stride_e  Tensor C strides - (Default: 0)
--stride_ds Tensor D strides - (Default: 0)
--validate  0. No validation, 1. Validation on GPU. (Default: 1)
-  -warmup  Number of iterations before benchmark the kernel. (Default: 10)
-  -repeat  Number of iterations to benchmark the kernel. (Default: 100)
-  -kbatch  kbatch for SplitK. (Default 1)
+          -m    m dimension (default:3840)
+          -n    n dimension (default:4096)
+          -k    k dimension (default:4096)
+   -a_layout    A tensor data layout - Row by default (default:R)
+   -b_layout    B tensor data layout - Col by default (default:C)
+  -ds_layout    Ds tensor data layout - Row by default (default:R)
+   -e_layout    E tensor data layout - Row by default (default:R)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+  -stride_ds    Tensor Ds stride (default:0)
+   -stride_e    Tensor E stride (default:0)
+          -v    0. No validation, 1. Validation on GPU (default:1)
+     -warmup    number of iterations before benchmark the kernel (default:50)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+     -kbatch    kbatch for SplitK (default:1)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:cktile_gemm_multi_d_fp16.json)
 ```
diff --git a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
index fc52cb66cc..ecb0e3df48 100644
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
@@ -17,7 +17,8 @@
 #include "gemm_multi_d_fp16.hpp"
 #include "utils.hpp"
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -29,58 +30,22 @@ template <typename ADataType,
           typename CDEElementWise = ck_tile::element_wise::PassThrough>
 auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config& s) -> float
 {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
+    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
+    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
+    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
 
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
+    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
+    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
+    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
 
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
+    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
+    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
 
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
+    constexpr bool DoubleSmemBuffer = GemmConfig::DoubleSmemBuffer;
+    constexpr bool kPadM            = false;
+    constexpr bool kPadN            = false;
+    constexpr bool kPadK            = false;
 
     constexpr bool TransposeC = false;
 
@@ -109,7 +74,8 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
     using GemmPipelineProblem =
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
 
     const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
     const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
@@ -123,7 +89,7 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
         [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
             constexpr bool has_hot_loop_v   = has_hot_loop_.value;
             constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
 
             using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
@@ -135,7 +101,8 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
                                                                                has_hot_loop_v,
                                                                                tail_number_v>;
 
-            using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
 
             using GemmEpilogue = ck_tile::CShuffleEpilogue<
                 ck_tile::CShuffleEpilogueProblem<ADataType,
@@ -203,4 +170,11 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
 
 #include "run_gemm_multi_d_fp16_example.inc"
 
-int main(int argc, char* argv[]) { return !run_multiple_d_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_multiple_d_gemm_example<GemmConfigV3_Wmma>(argc, argv);
+#else
+    return !run_multiple_d_gemm_example<GemmConfigV3>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
index 87b9592553..a7ae227627 100644
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
@@ -13,26 +13,6 @@
 #define CK_TILE_PIPELINE_MEMORY 2
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 
-#ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
-#endif
-
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#else
-#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
-#endif
-
 using ADataType   = ck_tile::half_t;
 using BDataType   = ck_tile::half_t;
 using D0DataType  = ck_tile::half_t;
@@ -41,6 +21,117 @@ using EDataType   = ck_tile::half_t;
 using DsDataType  = ck_tile::tuple<D0DataType, D1DataType>;
 using AccDataType = float;
 
+struct GemmConfigMemory
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+struct GemmConfigV3
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV4
+{
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV3_Wmma
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
@@ -58,7 +149,9 @@ auto create_args(int argc, char* argv[])
         .insert("v", "1", "0. No validation, 1. Validation on GPU")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("kbatch", "1", "kbatch for SplitK");
+        .insert("kbatch", "1", "kbatch for SplitK")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "cktile_gemm_multi_d_fp16.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -66,7 +159,8 @@ auto create_args(int argc, char* argv[])
 
 using gemm_multi_d_kargs = ck_tile::GemmMultiDHostArgs<DsDataType::size()>;
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
diff --git a/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc b/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc
index a0d7157d03..6a6a5a3afa 100644
--- a/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc
+++ b/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc
@@ -3,8 +3,10 @@
 
 #pragma once
 #include <cstddef>
+#include "ck_tile/utility/json_dump.hpp"
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -42,7 +44,8 @@ float invoke_gemm_multi_d(const void* a_m_k_dev_buf,
                                    StrideDs,
                                    StrideE});
 
-    float ave_time = gemm_multi_d<ADataType,
+    float ave_time = gemm_multi_d<GemmConfig,
+                                  ADataType,
                                   BDataType,
                                   DsDataType,
                                   AccDataType,
@@ -54,34 +57,11 @@ float invoke_gemm_multi_d(const void* a_m_k_dev_buf,
                                   CDEElementWise>(
         gemm_descs, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
-    std::string op_name{"Gemm Multiple-D"};
-    static constexpr ck_tile::index_t NumDTensor = DsDataType::size();
-
-    std::size_t flop = 0, num_btype = 0;
-
-    flop += std::size_t(2) * M * N * K;
-
-    ck_tile::static_for<0, NumDTensor, 1>{}([&](auto i) {
-        num_btype += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
-        flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
-    });
-
-    num_btype += sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
-
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm Multiple-D kernel with:\n";
-    std::cout << "M =" << M << " N =" << N << " K =" << K << "\n";
-    std::cout << "StrideA = " << StrideA << " StrideB = " << StrideB << " StrideE = " << StrideE
-              << "\n";
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << "\n";
-
     return ave_time;
 }
 
-template <typename ALayout,
+template <typename GemmConfig,
+          typename ALayout,
           typename BLayout,
           typename D0Layout,
           typename D1Layout,
@@ -159,29 +139,54 @@ int run_multiple_d_gemm_example_with_layouts(int argc,
 
     std::array<ck_tile::index_t, DsDataType::size()> stridesDs = {StrideD0, StrideD1};
 
-    invoke_gemm_multi_d<ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        EDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        ELayout,
-                        CDElementWiseFn>(a_m_k_dev_buf.GetDeviceBuffer(),
-                                         b_k_n_dev_buf.GetDeviceBuffer(),
-                                         ds_ptr_buf,
-                                         e_m_n_dev_buf.GetDeviceBuffer(),
-                                         M,
-                                         N,
-                                         K,
-                                         StrideA,
-                                         StrideB,
-                                         stridesDs,
-                                         StrideE,
-                                         n_warmup,
-                                         n_repeat,
-                                         k_batch);
+    float ave_time = invoke_gemm_multi_d<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         EDataType,
+                                         ALayout,
+                                         BLayout,
+                                         DsLayout,
+                                         ELayout,
+                                         CDElementWiseFn>(a_m_k_dev_buf.GetDeviceBuffer(),
+                                                          b_k_n_dev_buf.GetDeviceBuffer(),
+                                                          ds_ptr_buf,
+                                                          e_m_n_dev_buf.GetDeviceBuffer(),
+                                                          M,
+                                                          N,
+                                                          K,
+                                                          StrideA,
+                                                          StrideB,
+                                                          stridesDs,
+                                                          StrideE,
+                                                          n_warmup,
+                                                          n_repeat,
+                                                          k_batch);
+
+    std::string op_name{"Gemm Multiple-D"};
+    static constexpr ck_tile::index_t NumDTensor = DsDataType::size();
+
+    std::size_t flop = 0, num_btype = 0;
+
+    flop += std::size_t(2) * M * N * K;
+
+    ck_tile::static_for<0, NumDTensor, 1>{}([&](auto i) {
+        num_btype += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
+        flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
+    });
+
+    num_btype += sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm Multiple-D kernel with:\n";
+    std::cout << "M =" << M << " N =" << N << " K =" << K << "\n";
+    std::cout << "StrideA = " << StrideA << " StrideB = " << StrideB << " StrideE = " << StrideE
+              << "\n";
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << "\n";
 
     e_m_n_dev_buf.FromDevice(e_m_n_device_result.data());
 
@@ -217,9 +222,28 @@ int run_multiple_d_gemm_example_with_layouts(int argc,
                   << std::endl;
         std::cout << "The CPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
     }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_gemm_multi_d_fp16_json_results(arg_parser.get_str("jsonfile"),
+                                            op_name,
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            StrideD0,
+                                            StrideD1,
+                                            StrideE,
+                                            pass,
+                                            ave_time,
+                                            tflops,
+                                            gb_per_sec);
+    }
     return pass;
 }
 
+template <typename GemmConfig>
 int run_multiple_d_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -237,7 +261,7 @@ int run_multiple_d_gemm_example(int argc, char* argv[])
 
     if(a_layout == "R" && b_layout == "C" && ds_layout == "R")
     {
-        return run_multiple_d_gemm_example_with_layouts(
+        return run_multiple_d_gemm_example_with_layouts<GemmConfig>(
             argc, argv, Row{}, Col{}, Row{}, Row{}, Row{});
     }
     else
diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
index 5cb1d2650e..10332137e2 100644
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -7,5 +7,8 @@ target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMP
 add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
 target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 
+add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp)
+target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
 add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
 target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
index 52eaab9f94..fa914a7119 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
@@ -11,182 +11,14 @@
 
 #include "ck_tile/host.hpp"
 #include "grouped_convolution_utils.hpp"
-
-template <ck_tile::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename AccDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
-          typename DsDataType     = ck_tile::tuple<>,
-          typename DsLayout       = ck_tile::tuple<>,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
-                            const ck_tile::stream_config& s)
-{
-    constexpr int kBlockPerCu = 1;
-
-    constexpr ck_tile::index_t M_Tile = 64;
-    constexpr ck_tile::index_t N_Tile = 64;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr ck_tile::index_t VectorSizeA = 8;
-    constexpr ck_tile::index_t VectorSizeB = 8;
-    constexpr ck_tile::index_t VectorSizeC = 8;
-
-    // Implicit GEMM Traits
-    using CodegenShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
-    using GroupedConvTraitsType =
-        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
-    using CodegenPipelineProblem =
-        ck_tile::GemmPipelineProblem<InDataType,
-                                     WeiDataType,
-                                     AccDataType,
-                                     CodegenShape,
-                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
-                                     InDataType,
-                                     true,
-                                     VectorSizeA,
-                                     VectorSizeB>;
-    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using ConvEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<InDataType,
-                                             WeiDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                                             ck_tile::tensor_layout::gemm::RowMajor,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation,
-                                             1,
-                                             true,
-                                             VectorSizeC>>;
-
-        using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
-                                                                     TilePartitioner,
-                                                                     CodegenPipeline,
-                                                                     ConvEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << '\n'
-                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
-                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
-                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
-    }
-}
-
+#include "grouped_convolution_backward_data_invoker.hpp"
 #include "run_grouped_convolution_bwd_data_example.inc"
 
-template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
-int run_grouped_conv_bwd_data_example_prec_type(
-    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
-{
-    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
-    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
-    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
-
-    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
-
-    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
-    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
-    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
-
-    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NWGC{}, GKXC{}, NWGK{});
-    }
-    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
-    }
-    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported memory layout!");
-    }
-}
-
+template <typename GemmWarpConfig>
 int run_grouped_conv_bwd_data_example(int argc, char* argv[])
 {
+    using Invoker = GroupedConvolutionBackwardDataInvoker;
+
     auto [result, arg_parser] = create_args(argc, argv);
     if(!result)
         return -1;
@@ -198,12 +30,16 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[])
 
     if(data_type == "fp16")
     {
-        return run_grouped_conv_bwd_data_example_prec_type<ck_tile::half_t>(
+        return run_grouped_conv_bwd_data_example_prec_type<Invoker,
+                                                           GemmWarpConfig,
+                                                           ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
-        return run_grouped_conv_bwd_data_example_prec_type<ck_tile::bf16_t>(
+        return run_grouped_conv_bwd_data_example_prec_type<Invoker,
+                                                           GemmWarpConfig,
+                                                           ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else
@@ -212,4 +48,11 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[])
     }
 }
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_data_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_grouped_conv_bwd_data_example<GemmWarpConfig_Wmma>(argc, argv);
+#else
+    return !run_grouped_conv_bwd_data_example<GemmWarpConfig_Mfma>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
new file mode 100644
index 0000000000..1b3d45427d
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionBackwardDataInvoker
+{
+
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
+                                       const ck_tile::stream_config& s)
+    {
+        constexpr int kBlockPerCu = 1;
+
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 32;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+
+        constexpr ck_tile::index_t VectorSizeA = 1;
+        constexpr ck_tile::index_t VectorSizeB = 1;
+        constexpr ck_tile::index_t VectorSizeC = 8;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            InDataType,
+            WeiDataType,
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                InDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                OutDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
+                                                                         TilePartitioner,
+                                                                         CodegenPipeline,
+                                                                         ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(args);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+
+            float ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
+        if(args.k_batch == 1)
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    }
+};
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
index debbb6bc0c..82614bbb13 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -11,187 +11,13 @@
 
 #include "ck_tile/host.hpp"
 #include "grouped_convolution_utils.hpp"
-
-template <ck_tile::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename AccDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
-          typename DsDataType     = ck_tile::tuple<>,
-          typename DsLayout       = ck_tile::tuple<>,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
-                              const ck_tile::stream_config& s)
-{
-    constexpr int kBlockPerCu = 1;
-
-    constexpr ck_tile::index_t M_Tile = 64;
-    constexpr ck_tile::index_t N_Tile = 64;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr ck_tile::index_t VectorSizeA = 8;
-    constexpr ck_tile::index_t VectorSizeB = 8;
-    constexpr ck_tile::index_t VectorSizeC = 8;
-
-    // Implicit GEMM Traits
-    using CodegenShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
-    using GroupedConvTraitsType =
-        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
-    using CodegenPipelineProblem =
-        ck_tile::GemmPipelineProblem<InDataType,
-                                     WeiDataType,
-                                     AccDataType,
-                                     CodegenShape,
-                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
-                                     InDataType,
-                                     true,
-                                     VectorSizeA,
-                                     VectorSizeB>;
-    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using ConvEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<InDataType,
-                                             WeiDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                                             ck_tile::tensor_layout::gemm::RowMajor,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation,
-                                             1,
-                                             true,
-                                             VectorSizeC>>;
-
-        using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
-                                                                       TilePartitioner,
-                                                                       CodegenPipeline,
-                                                                       ConvEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(kargs);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << '\n'
-                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
-                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
-                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel_time_mask(
-            s,
-            Kernel::Preprocess(kargs, s),
-            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
-    }
-}
-
+#include "grouped_convolution_backward_weight_invoker.hpp"
 #include "run_grouped_convolution_bwd_weight_example.inc"
 
-template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
-int run_grouped_conv_bwd_weight_example_prec_type(
-    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+template <typename GemmWarpConfig>
+int run_grouped_conv_bwd_weight_example(ck_tile::ArgParser& arg_parser)
 {
-    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
-    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
-    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
-
-    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
-
-    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
-    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
-    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
-
-    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
-    {
-        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<1>{},
-                                                                InPrecType,
-                                                                WeiPrecType,
-                                                                OutPrecType>(
-            argc, argv, NWGC{}, GKXC{}, NWGK{});
-    }
-    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
-    {
-        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<2>{},
-                                                                InPrecType,
-                                                                WeiPrecType,
-                                                                OutPrecType>(
-            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
-    }
-    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
-    {
-        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<3>{},
-                                                                InPrecType,
-                                                                WeiPrecType,
-                                                                OutPrecType>(
-            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported memory layout!");
-    }
-}
-
-int run_grouped_conv_bwd_weight_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
+    using Invoker = GroupedConvolutionBackwardWeightInvoker;
 
     std::string data_type  = arg_parser.get_str("prec");
     std::string in_layout  = arg_parser.get_str("in_layout");
@@ -200,13 +26,17 @@ int run_grouped_conv_bwd_weight_example(int argc, char* argv[])
 
     if(data_type == "fp16")
     {
-        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::half_t>(
-            in_layout, wei_layout, out_layout, argc, argv);
+        return run_grouped_conv_bwd_weight_example_prec_type<Invoker,
+                                                             GemmWarpConfig,
+                                                             ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
-        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::bf16_t>(
-            in_layout, wei_layout, out_layout, argc, argv);
+        return run_grouped_conv_bwd_weight_example_prec_type<Invoker,
+                                                             GemmWarpConfig,
+                                                             ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, arg_parser);
     }
     else
     {
@@ -214,4 +44,24 @@ int run_grouped_conv_bwd_weight_example(int argc, char* argv[])
     }
 }
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    try
+    {
+#if CK_TILE_USE_WMMA
+        return !run_grouped_conv_bwd_weight_example<GemmWarpConfig_Wmma>(arg_parser);
+#else
+        return !run_grouped_conv_bwd_weight_example<GemmWarpConfig_Mfma>(arg_parser);
+#endif
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp
new file mode 100644
index 0000000000..f46707d1d2
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionBackwardWeightInvoker
+{
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
+                                         const ck_tile::stream_config& s)
+    {
+        constexpr int kBlockPerCu = 1;
+
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 64;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+
+        constexpr ck_tile::index_t VectorSizeA = 1;
+        constexpr ck_tile::index_t VectorSizeB = 1;
+        constexpr ck_tile::index_t VectorSizeC = 8;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            InDataType,
+            WeiDataType,
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                InDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                OutDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
+                                                                           TilePartitioner,
+                                                                           CodegenPipeline,
+                                                                           ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(kargs);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+
+            float ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                Kernel::Preprocess(kargs, s),
+                ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
+        if(args.k_batch == 1)
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    }
+};
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage.cpp
new file mode 100644
index 0000000000..82068156e5
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+#include "grouped_convolution_backward_weight_two_stage_invoker.hpp"
+#include "run_grouped_convolution_bwd_weight_example.inc"
+
+template <typename GemmWarpConfig>
+int run_grouped_conv_bwd_weight_example(ck_tile::ArgParser& arg_parser)
+{
+    using Invoker = GroupedConvolutionBackwardWeightTwoStageInvoker;
+
+    std::string data_type  = arg_parser.get_str("prec");
+    std::string in_layout  = arg_parser.get_str("in_layout");
+    std::string wei_layout = arg_parser.get_str("wei_layout");
+    std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<Invoker,
+                                                             GemmWarpConfig,
+                                                             ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, arg_parser);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<Invoker,
+                                                             GemmWarpConfig,
+                                                             ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, arg_parser);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation!");
+    }
+}
+
+int main(int argc, char* argv[])
+{
+
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    try
+    {
+#if CK_TILE_USE_WMMA
+        return !run_grouped_conv_bwd_weight_example<GemmWarpConfig_Wmma>(arg_parser);
+#else
+        return !run_grouped_conv_bwd_weight_example<GemmWarpConfig_Mfma>(arg_parser);
+#endif
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
new file mode 100644
index 0000000000..a8f321e7f9
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionBackwardWeightTwoStageInvoker
+{
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
+                                         const ck_tile::stream_config& s)
+    {
+        using WorkspaceDataType = float;
+
+        constexpr int kBlockPerCu = 1;
+
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 64;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+
+        constexpr ck_tile::index_t VectorSizeA = 1;
+        constexpr ck_tile::index_t VectorSizeB = 1;
+        constexpr ck_tile::index_t VectorSizeC = 1;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            OutDataType, // A: Out
+            InDataType,  // B: In
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                OutDataType, // A: Out
+                InDataType,  // B: In
+                DsDataType,
+                AccDataType,
+                WorkspaceDataType, // C: Workspace  normally Out
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
+                                                                           TilePartitioner,
+                                                                           CodegenPipeline,
+                                                                           ConvEpilogue>;
+
+            const ck_tile::index_t spatial_lengths_accum =
+                std::accumulate(args.filter_spatial_lengths_.begin(),
+                                args.filter_spatial_lengths_.end(),
+                                1,
+                                std::multiplies<ck_tile::index_t>());
+            ck_tile::DeviceMem ws_m_n_dev_buf(args.G_ * args.K_ * args.C_ * spatial_lengths_accum *
+                                              sizeof(WorkspaceDataType));
+            ck_tile::GroupedConvBwdWeightHostArgs ws_args =
+                ck_tile::GroupedConvBwdWeightHostArgs(args);
+            auto c_ptr      = ws_args.wei_ptr;
+            ws_args.wei_ptr = ws_m_n_dev_buf.GetDeviceBuffer();
+            auto kargs      = Kernel::MakeKernelArgs(ws_args);
+
+            const dim3 grids  = Kernel::GridSize(kargs);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+
+            using XElementwiseOperation = ck_tile::element_wise::UnaryConvert;
+            using BlockTile             = ck_tile::sequence<2048>;
+            using BlockWarps            = ck_tile::sequence<8>;
+            using WarpTile              = ck_tile::sequence<64>;
+
+            using ElementwiseShape =
+                ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, WorkspaceDataType>;
+            using Problem = ck_tile::ElementWisePipelineProblem<WorkspaceDataType,
+                                                                WorkspaceDataType,
+                                                                WeiDataType,
+                                                                ElementwiseShape,
+                                                                XElementwiseOperation>;
+            using ElementwiseKernel =
+                ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+            ck_tile::index_t total_elements     = 1;
+            std::vector<ck_tile::index_t> shape = {
+                static_cast<ck_tile::index_t>(args.G_ * args.K_),
+                static_cast<ck_tile::index_t>(args.C_ * spatial_lengths_accum)};
+
+            for(auto d : shape)
+                total_elements *= d;
+
+            const ck_tile::index_t kBlockSize = ElementwiseKernel::BlockSize();
+
+            constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+            ck_tile::index_t kGridSize =
+                (total_elements + elements_per_block - 1) / elements_per_block;
+
+            auto input_tensors =
+                ck_tile::make_tuple(static_cast<WorkspaceDataType*>(ws_args.wei_ptr));
+            auto input_size = ck_tile::make_tuple(shape[0], shape[1]);
+
+            // Check if the kernel configuration is supported
+            if(!ElementwiseKernel::IsSupportedArgument(input_size))
+            {
+                throw std::runtime_error(
+                    "Wrong! Elementwise arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+
+            auto preprocess = [&]() {
+                if(args.k_batch > 1)
+                    ck_tile::hip_check_error(
+                        hipMemsetAsync(ws_args.wei_ptr,
+                                       0,
+                                       shape[0] * shape[1] * sizeof(WorkspaceDataType),
+                                       s.stream_id_));
+            };
+
+            return ck_tile::launch_kernel_time_mask(
+                s,
+                preprocess,
+                ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs),
+                ck_tile::make_kernel<kBlockPerCu>(ElementwiseKernel{},
+                                                  kGridSize,
+                                                  kBlockSize,
+                                                  0,
+                                                  input_size,
+                                                  ck_tile::make_tuple(shape[1], 1), // Input Stride
+                                                  ck_tile::make_tuple(shape[1], 1), // Output Stride
+                                                  input_tensors,
+                                                  static_cast<WeiDataType*>(c_ptr)));
+        };
+
+        if(args.k_batch == 1)
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    }
+};
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
index 6700970583..4cddbae3ab 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -11,173 +11,14 @@
 
 #include "ck_tile/host.hpp"
 #include "grouped_convolution_utils.hpp"
-
-template <ck_tile::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename AccDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
-          typename DsDataType     = ck_tile::tuple<>,
-          typename DsLayout       = ck_tile::tuple<>,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr int kBlockPerCu = 1;
-
-    constexpr ck_tile::index_t M_Tile = 64;
-    constexpr ck_tile::index_t N_Tile = 64;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr ck_tile::index_t VectorSizeA = 8;
-    constexpr ck_tile::index_t VectorSizeB = 8;
-    constexpr ck_tile::index_t VectorSizeC = 8;
-
-    // Implicit GEMM Traits
-    using CodegenShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
-    using GroupedConvTraitsType =
-        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
-    using CodegenPipelineProblem =
-        ck_tile::GemmPipelineProblem<InDataType,
-                                     WeiDataType,
-                                     AccDataType,
-                                     CodegenShape,
-                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
-                                     InDataType,
-                                     true,
-                                     VectorSizeA,
-                                     VectorSizeB>;
-    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using ConvEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<InDataType,
-                                             WeiDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                                             ck_tile::tensor_layout::gemm::RowMajor,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation,
-                                             1,
-                                             true,
-                                             VectorSizeC>>;
-
-        using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
-                                                                TilePartitioner,
-                                                                CodegenPipeline,
-                                                                ConvEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(kargs);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << '\n'
-                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
-                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
-                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                          ck_tile::memory_operation_enum::set>{});
-}
-
+#include "grouped_convolution_forward_invoker.hpp"
 #include "run_grouped_convolution_fwd_example.inc"
 
-template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
-int run_grouped_conv_fwd_example_prec_type(
-    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
-{
-    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
-    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
-    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
-
-    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
-
-    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
-    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
-    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
-
-    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NWGC{}, GKXC{}, NWGK{});
-    }
-    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
-    }
-    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported memory layout!");
-    }
-}
-
+template <typename GemmWarpConfig>
 int run_grouped_conv_fwd_example(int argc, char* argv[])
 {
+    using Invoker = GroupedConvolutionForwardInvoker;
+
     auto [result, arg_parser] = create_args(argc, argv);
     if(!result)
         return -1;
@@ -189,12 +30,12 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])
 
     if(data_type == "fp16")
     {
-        return run_grouped_conv_fwd_example_prec_type<ck_tile::half_t>(
+        return run_grouped_conv_fwd_example_prec_type<Invoker, GemmWarpConfig, ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
-        return run_grouped_conv_fwd_example_prec_type<ck_tile::bf16_t>(
+        return run_grouped_conv_fwd_example_prec_type<Invoker, GemmWarpConfig, ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else
@@ -203,4 +44,11 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])
     }
 }
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_grouped_conv_fwd_example<GemmWarpConfig_Wmma>(argc, argv);
+#else
+    return !run_grouped_conv_fwd_example<GemmWarpConfig_Mfma>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
new file mode 100644
index 0000000000..0b9879d247
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionForwardInvoker
+{
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                                  const ck_tile::stream_config& s)
+    {
+        constexpr int kBlockPerCu = 1;
+
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 64;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+
+        constexpr ck_tile::index_t VectorSizeA = 8;
+        constexpr ck_tile::index_t VectorSizeB = 8;
+        constexpr ck_tile::index_t VectorSizeC = 8;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            InDataType,
+            WeiDataType,
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                InDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                OutDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                    TilePartitioner,
+                                                                    CodegenPipeline,
+                                                                    ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(kargs);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+
+            float ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+};
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
index f3a7a60fd9..ab2ce61b52 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -12,6 +12,20 @@
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/grouped_convolution.hpp"
 
+struct GemmWarpConfig_Mfma
+{
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
+struct GemmWarpConfig_Wmma
+{
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
 template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
 auto calculate_rtol_atol(const ck_tile::index_t GemmK,
                          const ck_tile::index_t kbatch,
@@ -120,12 +134,9 @@ auto create_args(int argc, char* argv[])
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
         .insert("split_k", "1", "splitK value")
-        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
-
-// host API
-float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
-                       const ck_tile::stream_config& s);
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
index d1cf4fade7..3d7635bf4f 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
@@ -3,6 +3,8 @@
 #pragma once
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType,
           typename AccDataType,
@@ -14,14 +16,15 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
                                    int n_warmup,
                                    int n_repeat)
 {
-    float ave_time = grouped_conv_bwd_data<NDimSpatial,
-                                           InDataType,
-                                           WeiDataType,
-                                           AccDataType,
-                                           OutDataType,
-                                           InLayout,
-                                           WeiLayout,
-                                           OutLayout>(
+    float ave_time = Invoker::template grouped_conv_bwd_data<NDimSpatial,
+                                                             GemmWarpConfig,
+                                                             InDataType,
+                                                             WeiDataType,
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             InLayout,
+                                                             WeiLayout,
+                                                             OutLayout>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
     std::size_t flop     = args.GetFlops();
@@ -36,6 +39,8 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
 }
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType = InDataType,
           typename OutDataType = InDataType,
@@ -136,6 +141,8 @@ int run_grouped_conv_bwd_data_example_with_layouts(
     std::cout << "output: " << output.mDesc << std::endl;
 
     invoke_grouped_conv_bwd_data<NDimSpatial,
+                                 GemmWarpConfig,
+                                 Invoker,
                                  InDataType,
                                  WeiDataType,
                                  AccDataType,
@@ -184,3 +191,59 @@ int run_grouped_conv_bwd_data_example_with_layouts(
 
     return pass;
 }
+
+template <typename Invoker,
+          typename GemmWarpConfig,
+          typename InPrecType,
+          typename WeiPrecType = InPrecType,
+          typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_data_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
index 637ea2fbfb..5e640514f9 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
@@ -3,6 +3,8 @@
 #pragma once
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType,
           typename AccDataType,
@@ -14,41 +16,34 @@ float invoke_grouped_conv_bwd_weight(ck_tile::GroupedConvBwdWeightHostArgs& args
                                      int n_warmup,
                                      int n_repeat)
 {
-    float ave_time = grouped_conv_bwd_weight<NDimSpatial,
-                                             InDataType,
-                                             WeiDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             InLayout,
-                                             WeiLayout,
-                                             OutLayout>(
+    float ave_time = Invoker::template grouped_conv_bwd_weight<NDimSpatial,
+                                                               GemmWarpConfig,
+                                                               InDataType,
+                                                               WeiDataType,
+                                                               AccDataType,
+                                                               OutDataType,
+                                                               InLayout,
+                                                               WeiLayout,
+                                                               OutLayout>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
-    std::size_t flop     = args.GetFlops();
-    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
-    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec     = num_byte / 1.E6 / ave_time;
-
-    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << std::endl;
-
     return ave_time;
 }
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType = InDataType,
           typename OutDataType = InDataType,
           typename InLayout,
           typename WeiLayout,
           typename OutLayout>
-int run_grouped_conv_bwd_weight_example_with_layouts(
-    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+int run_grouped_conv_bwd_weight_example_with_layouts(ck_tile::ArgParser& arg_parser,
+                                                     const InLayout,
+                                                     const WeiLayout,
+                                                     const OutLayout)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     using AccDataType = float;
 
     std::vector<ck_tile::index_t> filter_spatial_lengths;
@@ -130,21 +125,32 @@ int run_grouped_conv_bwd_weight_example_with_layouts(
                                                output_dev_buf.GetDeviceBuffer(),
                                                kbatch);
 
-    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
+    std::cout << "Run Grouped Conv Bwd Weight kernel" << std::endl;
     std::cout << "input: " << input.mDesc << std::endl;
     std::cout << "weight: " << weight.mDesc << std::endl;
     std::cout << "output: " << output.mDesc << std::endl;
 
-    invoke_grouped_conv_bwd_weight<NDimSpatial,
-                                   InDataType,
-                                   WeiDataType,
-                                   AccDataType,
-                                   OutDataType,
-                                   InLayout,
-                                   WeiLayout,
-                                   OutLayout>(args, n_warmup, n_repeat);
+    float ave_time = invoke_grouped_conv_bwd_weight<NDimSpatial,
+                                                    GemmWarpConfig,
+                                                    Invoker,
+                                                    InDataType,
+                                                    WeiDataType,
+                                                    AccDataType,
+                                                    OutDataType,
+                                                    InLayout,
+                                                    WeiLayout,
+                                                    OutLayout>(args, n_warmup, n_repeat);
 
     weight_dev_buf.FromDevice(weight.data());
+
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
     bool pass = true;
 
     if(arg_parser.get_int("v") == 1)
@@ -185,3 +191,61 @@ int run_grouped_conv_bwd_weight_example_with_layouts(
 
     return pass;
 }
+
+template <typename Invoker,
+          typename GemmWarpConfig,
+          typename InPrecType,
+          typename WeiPrecType = InPrecType,
+          typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_weight_example_prec_type(std::string in_layout,
+                                                  std::string wei_layout,
+                                                  std::string out_layout,
+                                                  ck_tile::ArgParser& arg_parser)
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<1>{},
+                                                                GemmWarpConfig,
+                                                                Invoker,
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            arg_parser, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<2>{},
+                                                                GemmWarpConfig,
+                                                                Invoker,
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            arg_parser, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<3>{},
+                                                                GemmWarpConfig,
+                                                                Invoker,
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            arg_parser, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
index 3532e343bb..beb6005e19 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
@@ -3,6 +3,8 @@
 #pragma once
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType,
           typename AccDataType,
@@ -14,14 +16,15 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
                               int n_warmup,
                               int n_repeat)
 {
-    float ave_time = grouped_conv_fwd<NDimSpatial,
-                                      InDataType,
-                                      WeiDataType,
-                                      AccDataType,
-                                      OutDataType,
-                                      InLayout,
-                                      WeiLayout,
-                                      OutLayout>(
+    float ave_time = Invoker::template grouped_conv_fwd<NDimSpatial,
+                                                        GemmWarpConfig,
+                                                        InDataType,
+                                                        WeiDataType,
+                                                        AccDataType,
+                                                        OutDataType,
+                                                        InLayout,
+                                                        WeiLayout,
+                                                        OutLayout>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
     std::size_t flop     = args.GetFlops();
@@ -36,6 +39,8 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
 }
 
 template <ck_tile::index_t NDimSpatial,
+          typename GemmWarpConfig,
+          typename Invoker,
           typename InDataType,
           typename WeiDataType = InDataType,
           typename OutDataType = InDataType,
@@ -136,6 +141,8 @@ int run_grouped_conv_fwd_example_with_layouts(
     std::cout << "output: " << output.mDesc << std::endl;
 
     invoke_grouped_conv_fwd<NDimSpatial,
+                            GemmWarpConfig,
+                            Invoker,
                             InDataType,
                             WeiDataType,
                             AccDataType,
@@ -184,3 +191,59 @@ int run_grouped_conv_fwd_example_with_layouts(
 
     return pass;
 }
+
+template <typename Invoker,
+          typename GemmWarpConfig,
+          typename InPrecType,
+          typename WeiPrecType = InPrecType,
+          typename OutPrecType = InPrecType>
+int run_grouped_conv_fwd_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
diff --git a/example/ck_tile/21_elementwise/elementwise_common.hpp b/example/ck_tile/21_elementwise/elementwise_common.hpp
new file mode 100644
index 0000000000..fd7889bd35
--- /dev/null
+++ b/example/ck_tile/21_elementwise/elementwise_common.hpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <variant>
+#include "ck_tile/core/arch/arch.hpp"
+
+auto string_to_datatype(const std::string& datatype)
+{
+    using PrecVariant = std::variant<ck_tile::half_t, ck_tile::bf16_t, float>;
+
+    if(datatype == "fp16")
+    {
+        return PrecVariant{ck_tile::half_t{}};
+    }
+    else if(datatype == "bf16")
+    {
+        return PrecVariant{ck_tile::bf16_t{}};
+    }
+    else if(datatype == "fp32")
+    {
+        return PrecVariant{float{}};
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type: " + datatype);
+    }
+};
diff --git a/example/ck_tile/21_elementwise/elementwise_example.cpp b/example/ck_tile/21_elementwise/elementwise_example.cpp
index 2cc539e117..e9fbeafde1 100644
--- a/example/ck_tile/21_elementwise/elementwise_example.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example.cpp
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include "elementwise_common.hpp"
 
 auto create_args(int argc, char* argv[])
 {
@@ -13,15 +14,21 @@ auto create_args(int argc, char* argv[])
         .insert("n", "1024", "n dimension")
         .insert("stride", "-1", "stride per row, if -1 then equal to n")
         .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("x_prec", "fp16", "input precision, fp16/bf16/fp32")
+        .insert("y_prec", "fp16", "output precision, fp16/bf16/fp32")
         .insert("warmup", "10", "cold iter")
-        .insert("repeat", "50", "hot iter");
+        .insert("repeat", "50", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "elementwise.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
 
-template <typename DataType>
+// XDataType: Data type of the input tensors.
+// ComputeDataType: Data type used for intermediate computations (often float for precision).
+// YDataType: Data type of the output tensor.
+template <typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t M      = arg_parser.get_int("m");
@@ -31,25 +38,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // If stride is negative (default -1), set it to N, assuming a dense row-major layout.
     if(stride < 0)
         stride = N;
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
 
     if(stride < N)
     {
         throw std::runtime_error("stride must be >= N");
     }
 
-    // Define type aliases for clarity.
-    // XDataType: Data type of the input tensors.
-    // ComputeDataType: Data type used for intermediate computations (often float for precision).
-    // YDataType: Data type of the output tensor.
     // XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
-    using XDataType = DataType;
     using ComputeDataType =
         float; // Using float for intermediate calculations can improve numerical stability.
-    using YDataType             = DataType;
     using XElementwiseOperation = ck_tile::element_wise::Add;
 
     // 1. Initialize the input data on the host (CPU).
@@ -137,8 +137,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // This is often a multiple of the wavefront size, 64 on CDNA.
     // Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
     // Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
-    constexpr ck_tile::index_t kBlockSize =
-        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
 
     // kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
     // This can influence occupancy and performance.
@@ -195,20 +194,45 @@ bool run(const ck_tile::ArgParser& arg_parser)
             y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
+                                      arg_parser.get_str("prec"),
+                                      kGridSize,
+                                      kBlockSize,
+                                      ave_time,
+                                      0,
+                                      0,
+                                      "elementwise_add");
+    }
+
     return pass;
 }
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    bool result = true;
+    ck_tile::ArgParser arg_parser;
+    std::tie(result, arg_parser) = create_args(argc, argv);
     if(!result)
         return -1;
 
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        return std::visit(
+            [&](auto&& x_dt, auto&& y_dt) -> int {
+                using XDataType = std::decay_t<decltype(x_dt)>;
+                using YDataType = std::decay_t<decltype(y_dt)>;
+                return run<XDataType, YDataType>(arg_parser);
+            },
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
     }
-
-    return -3;
 }
diff --git a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
index 7087d092a2..1b101c2e5f 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include "elementwise_common.hpp"
 
 auto create_args(int argc, char* argv[])
 {
@@ -14,15 +15,18 @@ auto create_args(int argc, char* argv[])
         .insert("dim2", "32", "dimension 2")
         .insert("dim3", "32", "dimension 3")
         .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("x_prec", "fp16", "input precision")
+        .insert("y_prec", "fp16", "output precision")
         .insert("warmup", "10", "cold iter")
-        .insert("repeat", "50", "hot iter");
+        .insert("repeat", "50", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "elementwise_add_4d.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
 
-template <typename DataType>
+template <typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t D0 = arg_parser.get_int("dim0");
@@ -30,15 +34,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::index_t D2 = arg_parser.get_int("dim2");
     ck_tile::index_t D3 = arg_parser.get_int("dim3");
 
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
 
-    using XDataType = DataType;
     using ComputeDataType =
         float; // Using float for intermediate calculations can improve numerical stability.
-    using YDataType             = DataType;
     using XElementwiseOperation = ck_tile::element_wise::Add;
 
     // Initialize the input data on the host (CPU).
@@ -83,8 +84,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     for(auto d : problem_shape)
         total_elements *= d;
 
-    constexpr ck_tile::index_t kBlockSize =
-        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
 
     constexpr ck_tile::index_t kBlockPerCu = 2;
 
@@ -140,20 +140,45 @@ bool run(const ck_tile::ArgParser& arg_parser)
             y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
+                                      arg_parser.get_str("prec"),
+                                      kGridSize,
+                                      kBlockSize,
+                                      ave_time,
+                                      0,
+                                      0,
+                                      "elementwise_add_4d");
+    }
+
     return pass;
 }
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    bool result = true;
+    ck_tile::ArgParser arg_parser;
+    std::tie(result, arg_parser) = create_args(argc, argv);
     if(!result)
         return -1;
 
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        return std::visit(
+            [&](auto&& x_dt, auto&& y_dt) -> int {
+                using XDataType = std::decay_t<decltype(x_dt)>;
+                using YDataType = std::decay_t<decltype(y_dt)>;
+                return run<XDataType, YDataType>(arg_parser);
+            },
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
     }
-
-    return -3;
 }
diff --git a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
index 28cdaf27b9..7cdb5cc0d1 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
@@ -4,6 +4,8 @@
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_transpose.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include "elementwise_common.hpp"
 
 auto create_args(int argc, char* argv[])
 {
@@ -14,7 +16,9 @@ auto create_args(int argc, char* argv[])
         .insert("v", "1", "cpu validation or not")
         .insert("prec", "fp16", "precision")
         .insert("warmup", "10", "cold iter")
-        .insert("repeat", "50", "hot iter");
+        .insert("repeat", "50", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "elementwise_transpose.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -29,10 +33,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     if(stride_in < 0)
         stride_in = N; // Dense input: stride for M dim is N
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
 
     if(stride_in < N)
     {
@@ -86,8 +89,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::index_t total_elements = M * N;
 
-    constexpr ck_tile::index_t kBlockSize =
-        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    const ck_tile::index_t kBlockSize             = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu        = 1;
     constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
     ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
@@ -137,21 +139,42 @@ bool run(const ck_tile::ArgParser& arg_parser)
             y_validation, y_host, "Transpose Error: Incorrect results!", 0.01, 0.01);
     }
 
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
+                                      arg_parser.get_str("prec"),
+                                      kGridSize,
+                                      kBlockSize,
+                                      ave_time,
+                                      0,
+                                      0,
+                                      "elementwise_transpose");
+    }
+
     return pass;
 }
 
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    bool result = true;
+    ck_tile::ArgParser arg_parser;
+    std::tie(result, arg_parser) = create_args(argc, argv);
     if(!result)
         return -1;
 
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto prec_variant = string_to_datatype(arg_parser.get_str("prec"));
+        return std::visit(
+            [&](auto&& dt) -> int {
+                using DataType = std::decay_t<decltype(dt)>;
+                return run<DataType>(arg_parser);
+            },
+            prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
     }
-
-    std::cerr << "Unsupported data type: " << data_type << std::endl;
-    return -3;
 }
diff --git a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
index 782d3da24d..4e19cfd688 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include "elementwise_common.hpp"
 
 auto create_args(int argc, char* argv[])
 {
@@ -13,15 +14,19 @@ auto create_args(int argc, char* argv[])
         .insert("n", "1024", "n dimension")
         .insert("stride", "-1", "stride per row, if -1 then equal to n")
         .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("op", "1", "unary operation, 1: square, 2: convert")
+        .insert("x_prec", "fp16", "input precision")
+        .insert("y_prec", "fp16", "output precision")
         .insert("warmup", "10", "cold iter")
-        .insert("repeat", "50", "hot iter");
+        .insert("repeat", "50", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "elementwise_unary.json", "json file name to dump results");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
 
-template <typename DataType>
+template <typename XElementwiseOperation, typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t M      = arg_parser.get_int("m");
@@ -29,17 +34,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::index_t stride = arg_parser.get_int("stride");
     if(stride < 0)
         stride = N;
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
 
     assert(stride >= N);
 
-    using XDataType             = DataType;
-    using YDataType             = DataType;
-    using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
-
     // 1. Initialize the input data on the host
     ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
     ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
@@ -78,8 +78,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     for(auto d : shape)
         total_elements *= d;
 
-    constexpr ck_tile::index_t kBlockSize =
-        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    const ck_tile::index_t kBlockSize      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
@@ -119,28 +118,106 @@ bool run(const ck_tile::ArgParser& arg_parser)
     {
         y_buf.FromDevice(y_validation.data());
 
-        auto op = [](const auto& v0) { return v0 * v0; };
+        auto op = [](const XDataType& v0) -> YDataType {
+            XElementwiseOperation element_op{};
+            YDataType result;
+            element_op(result, v0);
+            return result;
+        };
 
         ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);
 
         pass = ck_tile::check_err(
-            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+            y_validation, y_host, "Elementwise unary op: Incorrect results!", 0.01, 0.01);
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
+                                      arg_parser.get_str("prec"),
+                                      kGridSize,
+                                      kBlockSize,
+                                      ave_time,
+                                      0,
+                                      0,
+                                      "elementwise_unary");
     }
 
     return pass;
 }
 
+template <typename XElementwiseOperation, typename XDataType, typename YDataType>
+bool filter_then_run(const ck_tile::ArgParser& arg_parser)
+{
+    auto throw_unsupported = [&]() {
+        const auto x_prec = arg_parser.get_str("x_prec");
+        const auto op     = arg_parser.get_str("op");
+        throw std::runtime_error("Unsupported! x_prec: " + x_prec + ", op: " + op);
+    };
+    bool pass = true;
+
+    if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnarySquare> &&
+                 (std::is_same_v<XDataType, ck_tile::bf16_t> ||
+                  std::is_same_v<YDataType, ck_tile::bf16_t>))
+    {
+        throw_unsupported();
+    }
+    else if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnaryConvert> &&
+                      (std::is_same_v<XDataType, ck_tile::bf16_t> ||
+                       std::is_same_v<YDataType, ck_tile::bf16_t>))
+    {
+        throw_unsupported();
+    }
+    else
+    {
+        pass = run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
+    }
+
+    return pass;
+}
+
+auto string_to_op(const std::string& op)
+{
+    using OpVariant =
+        std::variant<ck_tile::element_wise::UnarySquare, ck_tile::element_wise::UnaryConvert>;
+
+    if(op == "1")
+        return OpVariant{ck_tile::element_wise::UnarySquare{}};
+    else if(op == "2")
+        return OpVariant{ck_tile::element_wise::UnaryConvert{}};
+    else
+    {
+        throw std::runtime_error("Unsupported unary operation: " + op);
+    }
+};
+
 int main(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    bool result = true;
+    ck_tile::ArgParser arg_parser;
+    std::tie(result, arg_parser) = create_args(argc, argv);
     if(!result)
         return -1;
 
-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        const auto op_variant     = string_to_op(arg_parser.get_str("op"));
+        return std::visit(
+            [&](auto&& op, auto&& x_dt, auto&& y_dt) -> int {
+                using XElementwiseOperation = std::decay_t<decltype(op)>;
+                using XDataType             = std::decay_t<decltype(x_dt)>;
+                using YDataType             = std::decay_t<decltype(y_dt)>;
+                return filter_then_run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
+            },
+            op_variant,
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
     }
-
-    return -3;
 }
diff --git a/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt b/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt
new file mode 100644
index 0000000000..f382e0cf45
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(tile_example_gemm_multi_abd_fp16 EXCLUDE_FROM_ALL gemm_multi_abd_fp16.cpp)
diff --git a/example/ck_tile/22_gemm_multi_abd/README.md b/example/ck_tile/22_gemm_multi_abd/README.md
new file mode 100644
index 0000000000..c272df3fb5
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/README.md
@@ -0,0 +1,35 @@
+#Multiple ABD GEMM
+
+This folder contains example for Multiple ABD GEMM using ck_tile tile-programming implementation.
+
+## build
+```
+#in the root of ck_tile
+mkdir build && cd build
+#you can replace < arch> with the appropriate architecture(for example gfx90a or gfx942) or \
+    leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+#The basic pipeline method on the gemm calculation
+make tile_example_gemm_multi_abd_fp16 -j
+```
+This will result in an executable `build/bin/tile_example_gemm_multi_abd_fp16`
+
+## example
+```
+args:
+       -m  M dimensions - (Default: 3840)
+       -n  N dimensions - (Default: 4096)
+       -k  K dimensions - (Default: 4096)
+-as_layout  Tensor A layout (default:R)
+-bs_layout  Tensor B layout (default:C)
+-ds_layout  Tensor D layout (default:R)
+-e_layout   Tensor E layout (default:R)
+-stride_as  Tensor A strides - (Default: 0)
+-stride_bs  Tensor B strides - (Default: 0)
+-stride_e   Tensor C strides - (Default: 0)
+-stride_ds  Tensor D strides - (Default: 0)
+-validate   0. No validation, 1. Validation on GPU. (Default: 1)
+  -warmup   Number of iterations before benchmark the kernel. (Default: 10)
+  -repeat   Number of iterations to benchmark the kernel. (Default: 100)
+  -kbatch   kbatch for SplitK. (Default: 1)
+```
\ No newline at end of file
diff --git a/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.cpp b/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.cpp
new file mode 100644
index 0000000000..6d955c3a09
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.cpp
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_multi_abd_fp16.hpp"
+#include "utils.hpp"
+
+template <typename GemmConfig,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AElementWise   = ck_tile::element_wise::PassThrough,
+          typename BElementWise   = ck_tile::element_wise::PassThrough,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+auto gemm_multi_abd(const gemm_multi_abd_kargs& args, const ck_tile::stream_config& s) -> float
+{
+    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
+    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
+    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
+
+    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
+    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
+    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
+
+    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
+    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
+    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
+
+    constexpr bool DoubleSmemBuffer = GemmConfig::DoubleSmemBuffer;
+    constexpr bool kPadM            = false;
+    constexpr bool kPadN            = false;
+    constexpr bool kPadK            = false;
+
+    constexpr bool TransposeC = false;
+
+    constexpr int kBlockPerCu                         = 1;
+    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    using GemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::
+        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, AsLayout, BsLayout, ELayout>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                 kPadN,
+                                                                 kPadK,
+                                                                 DoubleSmemBuffer,
+                                                                 AsLayout,
+                                                                 BsLayout,
+                                                                 ELayout,
+                                                                 TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<AsDataType, BsDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run =
+        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GemmConfig::Scheduler;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<AsDataType,
+                                                                               BsDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v,
+                                                                               AElementWise,
+                                                                               BElementWise>;
+
+            using GemmPipeline = typename PipelineTypeTraits<
+                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 EDataType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 CDEElementWise,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+
+            using Kernel = ck_tile::GemmKernelMultiABD<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args:" << " grid: {" << grids.x << ", "
+                          << grids.y << ", " << grids.z << "}" << ", blocks: {" << blocks.x << ", "
+                          << blocks.y << ", " << blocks.z << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            return ave_time;
+        };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+    return ave_time;
+}
+
+#include "run_gemm_multi_abd_fp16_example.inc"
+
+int main(int argc, char* argv[])
+{
+#if CK_TILE_USE_WMMA
+    return !run_multiple_abd_gemm_example<GemmConfigV3_Wmma>(argc, argv);
+#else
+    return !run_multiple_abd_gemm_example<GemmConfigV3>(argc, argv);
+#endif
+}
diff --git a/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.hpp b/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.hpp
new file mode 100644
index 0000000000..35bc232eca
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/gemm_multi_abd_fp16.hpp
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#endif
+
+using A0DataType = ck_tile::half_t;
+using A1DataType = ck_tile::half_t;
+
+using B0DataType = ck_tile::half_t;
+using B1DataType = ck_tile::half_t;
+
+using D0DataType = ck_tile::half_t;
+using D1DataType = ck_tile::half_t;
+
+using EDataType = ck_tile::half_t;
+
+using AsDataType = ck_tile::tuple<A0DataType, A1DataType>;
+using BsDataType = ck_tile::tuple<B0DataType, B1DataType>;
+using DsDataType = ck_tile::tuple<D0DataType, D1DataType>;
+
+using AccDataType = float;
+
+struct GemmConfigMemory
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+struct GemmConfigV3
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV4
+{
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+struct GemmConfigV3_Wmma
+{
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("k", "4096", "k dimension")
+        .insert("as_layout", "R", "As tensor data layout - Row by default")
+        .insert("bs_layout", "C", "Bs tensor data layout - Col by default")
+        .insert("ds_layout", "R", "Ds tensor data layout - Row by default")
+        .insert("e_layout", "R", "E tensor data layout - Row by default")
+        .insert("stride_as", "0", "Tensor A stride")
+        .insert("stride_bs", "0", "Tensor B stride")
+        .insert("stride_ds", "0", "Tensor Ds stride")
+        .insert("stride_e", "0", "Tensor E stride")
+        .insert("v", "1", "0. No validation, 1. Validation on GPU")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("kbatch", "1", "kbatch for SplitK");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+using gemm_multi_abd_kargs =
+    ck_tile::GemmMultiABDHostArgs<AsDataType::size(), BsDataType::size(), DsDataType::size()>;
+
+template <typename GemmConfig,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename AElementWise,
+          typename BElementWise,
+          typename CDEElementWise>
+float gemm_multi_abd(const gemm_multi_abd_kargs& kargs, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/22_gemm_multi_abd/run_gemm_multi_abd_fp16_example.inc b/example/ck_tile/22_gemm_multi_abd/run_gemm_multi_abd_fp16_example.inc
new file mode 100644
index 0000000000..881961c9db
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/run_gemm_multi_abd_fp16_example.inc
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <cstddef>
+
+template <typename GemmConfig,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AElementWise   = ck_tile::element_wise::PassThrough,
+          typename BElementWise   = ck_tile::element_wise::PassThrough,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm_multi_abd(const std::array<const void*, AsDataType::size()>& as_m_k_dev_buf,
+                            const std::array<const void*, BsDataType::size()>& bs_k_n_dev_buf,
+                            const std::array<const void*, DsDataType::size()>& ds_m_n_dev_buf,
+                            void* e_m_n_dev_buf,
+                            ck_tile::index_t M,
+                            ck_tile::index_t N,
+                            ck_tile::index_t K,
+                            const std::array<ck_tile::index_t, AsDataType::size()>& StrideAs,
+                            const std::array<ck_tile::index_t, BsDataType::size()>& StrideBs,
+                            const std::array<ck_tile::index_t, DsDataType::size()>& StrideDs,
+                            ck_tile::index_t StrideE,
+                            int n_warmup,
+                            int n_repeat,
+                            int k_batch)
+{
+    gemm_multi_abd_kargs gemm_descs({as_m_k_dev_buf,
+                                     bs_k_n_dev_buf,
+                                     ds_m_n_dev_buf,
+                                     e_m_n_dev_buf,
+                                     k_batch,
+                                     M,
+                                     N,
+                                     K,
+                                     StrideAs,
+                                     StrideBs,
+                                     StrideDs,
+                                     StrideE});
+
+    float ave_time = gemm_multi_abd<GemmConfig,
+                                    AsDataType,
+                                    BsDataType,
+                                    DsDataType,
+                                    AccDataType,
+                                    EDataType,
+                                    AsLayout,
+                                    BsLayout,
+                                    DsLayout,
+                                    ELayout,
+                                    AElementWise,
+                                    BElementWise,
+                                    CDEElementWise>(
+        gemm_descs, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::string op_name{"Gemm Multiple-ABD"};
+
+    std::size_t flop = 0, num_btype = 0;
+
+    flop += std::size_t(2) * M * N * K;
+
+    num_btype +=
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm Multiple-ABD kernel with:\n";
+    std::cout << "M =" << M << " N =" << N << " K =" << K << "\n";
+    std::cout << "StrideA = " << StrideAs[0] << " StrideB = " << StrideBs[0]
+              << " StrideE = " << StrideE << "\n";
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << "\n";
+
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename A0Layout,
+          typename A1Layout,
+          typename B0Layout,
+          typename B1Layout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout>
+int run_gemm_multi_abd_example_with_layouts(int argc,
+                                            char* argv[],
+                                            const A0Layout a0_layout = A0Layout{},
+                                            const A1Layout a1_layout = A1Layout{},
+                                            const B0Layout b0_layout = B0Layout{},
+                                            const B1Layout b1_layout = B1Layout{},
+                                            const D0Layout d0_layout = D0Layout{},
+                                            const D1Layout d1_layout = D1Layout{},
+                                            const ELayout e_layout   = ELayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+    using AElementWiseFn   = ck_tile::element_wise::AddScale;
+    using BElementWiseFn   = ck_tile::element_wise::AddScale;
+    using CDEElementWiseFn = ck_tile::element_wise::MultiDMultiply;
+    using AsLayout         = ck_tile::tuple<A0Layout, A1Layout>;
+    using BsLayout         = ck_tile::tuple<B0Layout, B1Layout>;
+    using DsLayout         = ck_tile::tuple<D0Layout, D1Layout>;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t StrideA = arg_parser.get_int("stride_as");
+    ck_tile::index_t StrideB = arg_parser.get_int("stride_bs");
+    ck_tile::index_t StrideD = arg_parser.get_int("stride_ds");
+    ck_tile::index_t StrideE = arg_parser.get_int("stride_e");
+
+    ck_tile::index_t StrideA0 = StrideA;
+    ck_tile::index_t StrideA1 = StrideA;
+
+    ck_tile::index_t StrideB0 = StrideB;
+    ck_tile::index_t StrideB1 = StrideB;
+
+    ck_tile::index_t StrideD0 = StrideD;
+    ck_tile::index_t StrideD1 = StrideD;
+
+    const int n_warmup = arg_parser.get_int("warmup");
+    const int n_repeat = arg_parser.get_int("repeat");
+    const int k_batch  = arg_parser.get_int("kbatch");
+
+    StrideA0 = get_default_stride(M, N, StrideA0, is_row_major(a1_layout));
+    StrideA1 = get_default_stride(M, N, StrideA1, is_row_major(a1_layout));
+
+    StrideB0 = get_default_stride(K, N, StrideB0, is_row_major(b0_layout));
+    StrideB1 = get_default_stride(K, N, StrideB1, is_row_major(b1_layout));
+
+    StrideD0 = get_default_stride(M, N, StrideD0, is_row_major(d0_layout));
+    StrideD1 = get_default_stride(M, N, StrideD1, is_row_major(d1_layout));
+
+    StrideE = get_default_stride(M, N, StrideE, is_row_major(e_layout));
+
+    ck_tile::HostTensor<A0DataType> a0_m_k_tesnor(
+        host_tensor_descriptor(M, K, StrideA0, is_row_major(a0_layout)));
+    ck_tile::HostTensor<A1DataType> a1_m_k_tesnor(
+        host_tensor_descriptor(M, K, StrideA1, is_row_major(a1_layout)));
+
+    ck_tile::HostTensor<B0DataType> b0_k_n_tensors(
+        host_tensor_descriptor(K, N, StrideB0, is_row_major(b0_layout)));
+    ck_tile::HostTensor<B1DataType> b1_k_n_tensors(
+        host_tensor_descriptor(K, N, StrideB1, is_row_major(b1_layout)));
+
+    ck_tile::HostTensor<D0DataType> d0_m_n_tensors(
+        host_tensor_descriptor(M, N, StrideD0, is_row_major(d0_layout)));
+    ck_tile::HostTensor<D1DataType> d1_m_n_tensors(
+        host_tensor_descriptor(M, N, StrideD1, is_row_major(d1_layout)));
+
+    ck_tile::HostTensor<EDataType> e_m_n_device_result(
+        host_tensor_descriptor(M, N, StrideE, is_row_major(e_layout)));
+
+    ck_tile::FillUniformDistribution<A0DataType>{-1.f, 1.f}(a0_m_k_tesnor);
+    ck_tile::FillUniformDistribution<A1DataType>{-1.f, 1.f}(a1_m_k_tesnor);
+
+    ck_tile::FillUniformDistribution<B0DataType>{-1.f, 1.f}(b0_k_n_tensors);
+    ck_tile::FillUniformDistribution<B1DataType>{-1.f, 1.f}(b1_k_n_tensors);
+
+    ck_tile::FillUniformDistribution<D0DataType>{-1.f, 1.f}(d0_m_n_tensors);
+    ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n_tensors);
+
+    ck_tile::DeviceMem a0_m_k_dev_buf(a0_m_k_tesnor.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem a1_m_k_dev_buf(a1_m_k_tesnor.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem b0_k_n_dev_buf(b0_k_n_tensors.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b1_k_n_dev_buf(b1_k_n_tensors.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem d0_m_n_dev_buf(d0_m_n_tensors.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem d1_m_n_dev_buf(d1_m_n_tensors.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem e_m_n_dev_buf(e_m_n_device_result.get_element_space_size_in_bytes());
+
+    a0_m_k_dev_buf.ToDevice(a0_m_k_tesnor.mData.data());
+    a1_m_k_dev_buf.ToDevice(a1_m_k_tesnor.mData.data());
+
+    b0_k_n_dev_buf.ToDevice(b0_k_n_tensors.mData.data());
+    b1_k_n_dev_buf.ToDevice(b1_k_n_tensors.mData.data());
+
+    d0_m_n_dev_buf.ToDevice(d0_m_n_tensors.mData.data());
+    d1_m_n_dev_buf.ToDevice(d1_m_n_tensors.mData.data());
+
+    e_m_n_dev_buf.SetZero();
+    e_m_n_device_result.SetZero();
+
+    std::array<const void*, DsDataType::size()> as_ptr_buf = {a0_m_k_dev_buf.GetDeviceBuffer(),
+                                                              a1_m_k_dev_buf.GetDeviceBuffer()};
+
+    std::array<const void*, DsDataType::size()> bs_ptr_buf = {b0_k_n_dev_buf.GetDeviceBuffer(),
+                                                              b1_k_n_dev_buf.GetDeviceBuffer()};
+
+    std::array<const void*, DsDataType::size()> ds_ptr_buf = {d0_m_n_dev_buf.GetDeviceBuffer(),
+                                                              d1_m_n_dev_buf.GetDeviceBuffer()};
+
+    std::array<ck_tile::index_t, AsDataType::size()> strideAs = {StrideA0, StrideA1};
+    std::array<ck_tile::index_t, BsDataType::size()> strideBs = {StrideB0, StrideB1};
+    std::array<ck_tile::index_t, DsDataType::size()> strideDs = {StrideD0, StrideD1};
+
+    invoke_gemm_multi_abd<GemmConfig,
+                          AsDataType,
+                          BsDataType,
+                          DsDataType,
+                          AccDataType,
+                          EDataType,
+                          AsLayout,
+                          BsLayout,
+                          DsLayout,
+                          ELayout,
+                          AElementWiseFn,
+                          BElementWiseFn,
+                          CDEElementWiseFn>(as_ptr_buf,
+                                            bs_ptr_buf,
+                                            ds_ptr_buf,
+                                            e_m_n_dev_buf.GetDeviceBuffer(),
+                                            M,
+                                            N,
+                                            K,
+                                            strideAs,
+                                            strideBs,
+                                            strideDs,
+                                            StrideE,
+                                            n_warmup,
+                                            n_repeat,
+                                            k_batch);
+
+    e_m_n_dev_buf.FromDevice(e_m_n_device_result.data());
+
+    ck_tile::HostTensor<A0DataType> a_m_k_host_ref_element_result(
+        host_tensor_descriptor(M, K, StrideA0, is_row_major(a0_layout)));
+    ck_tile::HostTensor<B0DataType> b_k_n_host_ref_element_result(
+        host_tensor_descriptor(K, N, StrideB0, is_row_major(b0_layout)));
+    ck_tile::HostTensor<EDataType> e_m_n_host_ref(
+        host_tensor_descriptor(M, N, StrideE, is_row_major(e_layout)));
+    a_m_k_host_ref_element_result.SetZero();
+    b_k_n_host_ref_element_result.SetZero();
+    e_m_n_host_ref.SetZero();
+
+    ck_tile::reference_gemm_multiple_abd<AsDataType,
+                                         BsDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         EDataType,
+                                         AElementWiseFn,
+                                         BElementWiseFn,
+                                         CDEElementWiseFn>({a0_m_k_tesnor, a1_m_k_tesnor},
+                                                           {b0_k_n_tensors, b1_k_n_tensors},
+                                                           {d0_m_n_tensors, d1_m_n_tensors},
+                                                           a_m_k_host_ref_element_result,
+                                                           b_k_n_host_ref_element_result,
+                                                           e_m_n_host_ref);
+
+    bool pass{true};
+    if(arg_parser.get_int("v"))
+    {
+        const float max_accumulated_value =
+            *std::max_element(e_m_n_host_ref.mData.begin(), e_m_n_host_ref.mData.end());
+
+        const auto rtol_atol = calculate_rtol_atol(K, 1, max_accumulated_value);
+
+        pass &= ck_tile::check_err(e_m_n_device_result,
+                                   e_m_n_host_ref,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << std::endl;
+        std::cout << "Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+    return pass;
+}
+
+template <typename GemmConfig>
+int run_multiple_abd_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string as_layout = arg_parser.get_str("as_layout");
+    const std::string bs_layout = arg_parser.get_str("bs_layout");
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if(as_layout == "R" && bs_layout == "C")
+    {
+        return run_gemm_multi_abd_example_with_layouts<GemmConfig>(
+            argc, argv, Row{}, Row{}, Col{}, Col{}, Row{}, Row{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
diff --git a/example/ck_tile/22_gemm_multi_abd/utils.hpp b/example/ck_tile/22_gemm_multi_abd/utils.hpp
new file mode 100644
index 0000000000..38bf8623d4
--- /dev/null
+++ b/example/ck_tile/22_gemm_multi_abd/utils.hpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeTypeAB =
+        std::conditional_t<sizeof(A0DataType) < sizeof(B0DataType), A0DataType, B0DataType>;
+
+    using ComputeType =
+        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+        max_accumulated_value, kbatch);
+
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
diff --git a/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp b/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp
index 571386694b..79013594ec 100644
--- a/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp
+++ b/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp
@@ -12,6 +12,7 @@
 
 #include "batched_transpose_example.hpp"
 
+#include "ck_tile/utility/json_dump.hpp"
 #if 0
 template <typename T>
 void dump_host_tensor_4d(const ck_tile::HostTensor<T>& x)
@@ -103,6 +104,8 @@ auto create_args(int argc, char* argv[])
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
         .insert("seed", "-1", "seed to be used, -1 means random every time")
         .insert("kname", "0", "t to 1 will print kernel name")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "batched_transpose.json", "json file name to dump results")
         .insert("pipeline", "0", "0: no LDS usage, 1: LDS-accelerated (gfx950)");
 
     bool result = arg_parser.parse(argc, argv);
@@ -236,6 +239,23 @@ bool run_batched_transpose(ck_tile::ArgParser args)
            "--------------------------------------------------------------------\n",
            rtn ? "y" : "n");
     fflush(stdout);
+
+    if(args.get_int("json") == 1)
+    {
+        dump_batched_transpose_json(args.get_str("jsonfile"),
+                                    N,
+                                    C,
+                                    H,
+                                    W,
+                                    layout_in,
+                                    layout_out,
+                                    prec,
+                                    ms,
+                                    0,
+                                    gb_per_sec,
+                                    rtn);
+    }
+
     return rtn;
 }
 
diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
index 12cf874c73..7358d4d749 100644
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -6,10 +6,8 @@ endif()
 list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
 if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
-    add_executable(tile_example_gemm_aquant_basic EXCLUDE_FROM_ALL gemm_aquant_basic.cpp)
-    target_compile_options(tile_example_gemm_aquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_executable(tile_example_gemm_bquant_basic EXCLUDE_FROM_ALL gemm_bquant_basic.cpp)
-    target_compile_options(tile_example_gemm_bquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    add_executable(tile_example_gemm_quant_basic EXCLUDE_FROM_ALL gemm_quant_basic.cpp)
+    target_compile_options(tile_example_gemm_quant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 else()
     message(DEBUG "Skipping ck_tile quant gemm tests for current target")
 endif()
diff --git a/example/ck_tile/38_block_scale_gemm/README.md b/example/ck_tile/38_block_scale_gemm/README.md
index 6d6aec28c8..7f8aba7b3d 100644
--- a/example/ck_tile/38_block_scale_gemm/README.md
+++ b/example/ck_tile/38_block_scale_gemm/README.md
@@ -1,18 +1,22 @@
-# GEMM Matrix Multiplication
+# Quant GEMM Matrix Multiplication
 
-This folder contains example for Block Scale GEMM using ck_tile tile-programming implementation. 
+This folder contains examples of quant GEMMs using the ck_tile tile-programming implementation.
+
+- AQuant kernel with blocks of A matrix sharing scales: custom GEMM pipeline
+- BQuant kernel with blocks of B matrix sharing scales: custom GEMM pipeline
+- Row and Column-wise scaled: scaling implemented in Epilogue
+- Tensor-wise scaled: scaling implemented in Epilogue
 
 ## build
 ```
 # in the root of ck_tile
 mkdir build && cd build
-# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+# you can replace <arch> with the appropriate architecture (for example gfx942) or leave it blank
 ../script/cmake-ck-dev.sh  ../ <arch>
-# The aquant pipeline method on the gemm calculation
-make tile_example_gemm_aquant_basic -j
-make tile_example_gemm_bquant_basic -j
+# Compile the quant kernels
+make tile_example_gemm_quant_basic -j
 ```
-This will result in an executable `build/bin/tile_example_gemm_aquant_basic`
+This will result in an executable `build/bin/tile_example_gemm_quant_basic`
 
 ## example
 ```
@@ -22,15 +26,27 @@ args:
           -n    n dimension (default:2048)
           -k    k dimension (default:64)
    -a_layout    Tensor A data layout (default: R)
-   -b_layout    Tensor B data layout (default: R)
+   -b_layout    Tensor B data layout (default: C)
    -c_layout    Tensor C data layout (default: R)
    -stride_a    Tensor A stride (default:0)
    -stride_b    Tensor B stride (default:0)
    -stride_c    Tensor C stride (default:0)
-          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:1)
           -e    Absolute error tolerance (default:1e-5)
-       -prec    data type. fp16/bf16/fp8/bf8/int8 (default:fp16)
+       -prec    data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8 (default:fp8)
      -warmup    number of iterations before benchmark the kernel (default:10)
      -repeat    number of iterations to benchmark the kernel (default:100)
       -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+ -quant_mode    Which quant method to use (aquant, bquant, tensor, rowcol)
 ```
+
+User need to select correct mapping of config for each quant mode:
+
+|  | quant_mode as runtime argument | Config in cpp file |
+|:--------|:-----:|-------|
+| For selecting AQuant  | aquant  | GemmConfigQuant    |
+| For selecting Aquant with Preshuffle   | aquant  | GemmConfigPreshuffleQuant    |
+| For selecting BQuant  | bquant  | GemmConfigQuant    |
+| For selecting PreShuffle Weight matrix with Bquant | bquant | GemmConfigPreshuffleB_Bquant_decode (or) GemmConfigPreshuffleB_Bquant_prefill
+| For selecting RowCol quant  | rowcolquant  | GemmConfigRowColQuant    |
+
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
deleted file mode 100644
index d5a38fe754..0000000000
--- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <stdexcept>
-#include <string>
-#include <tuple>
-
-#include "gemm_utils.hpp"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename AQDataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ComputeDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
-
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
-    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
-    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits = ck_tile::TileGemmAQuantTraits<kPadM,
-                                                            kPadN,
-                                                            kPadK,
-                                                            GemmConfig::PreshuffleQuant,
-                                                            ALayout,
-                                                            BLayout,
-                                                            CLayout>;
-
-    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
-                                                                 BDataType,
-                                                                 AccDataType,
-                                                                 CodegenGemmShape,
-                                                                 CodegenGemmTraits,
-                                                                 ComputeDataType>;
-
-    using BaseGemmPipeline = ck_tile::BaseAQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
-
-    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
-    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    constexpr bool transposed_warp_gemm = true;
-
-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-
-        using CodegenPipelineProblem =
-            ck_tile::GemmAQuantPipelineProblem<ADataType,
-                                               AQDataType,
-                                               BDataType,
-                                               AccDataType,
-                                               CodegenGemmShape,
-                                               CodegenGemmTraits,
-                                               QuantGroupSize,
-                                               transposed_warp_gemm,
-                                               ComputeDataType,
-                                               ck_tile::GemmPipelineScheduler::Intrawave,
-                                               has_hot_loop_v,
-                                               tail_number_v>;
-        using CodegenGemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
-        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-                   ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                    BDataType,
-                                                    ck_tile::tuple<>,
-                                                    AccDataType,
-                                                    CDataType,
-                                                    ck_tile::tuple<>,
-                                                    CLayout,
-                                                    ck_tile::element_wise::PassThrough,
-                                                    TilePartitioner::MPerBlock,
-                                                    TilePartitioner::NPerBlock,
-                                                    M_Warp,
-                                                    N_Warp,
-                                                    M_Warp_Tile,
-                                                    N_Warp_Tile,
-                                                    K_Warp_Tile,
-                                                    transposed_warp_gemm,
-                                                    ck_tile::memory_operation_enum::set>>;
-        using Kernel =
-            ck_tile::AQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-
-        auto kargs = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(args.k_batch != 1)
-        {
-            throw std::runtime_error("split-k is not supported yet!");
-        }
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
-}
-
-#include "run_gemm_aquant_example.inc"
-
-template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
-                argc, argv, Row{}, Row{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for A.");
-    }
-
-    return 0;
-}
-
-template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    std::string data_type = arg_parser.get_str("prec");
-    std::string a_layout  = arg_parser.get_str("a_layout");
-    std::string b_layout  = arg_parser.get_str("b_layout");
-
-    if(data_type == "fp8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "bf8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4fp8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::fp8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::fp8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4bf8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::bf8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::bf8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for this operation !!!");
-    }
-}
-
-int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigDecode>(argc, argv); }
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
deleted file mode 100644
index 13c416110a..0000000000
--- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <stdexcept>
-#include <string>
-#include <tuple>
-
-#include "gemm_utils.hpp"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename AQDataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ComputeDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
-
-    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
-
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
-    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
-    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits = ck_tile::TileGemmAQuantTraits<kPadM,
-                                                            kPadN,
-                                                            kPadK,
-                                                            GemmConfig::PreshuffleQuant,
-                                                            ALayout,
-                                                            BLayout,
-                                                            CLayout>;
-
-    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
-                                                                 BDataType,
-                                                                 AccDataType,
-                                                                 CodegenGemmShape,
-                                                                 CodegenGemmTraits,
-                                                                 ComputeDataType>;
-
-    using BaseGemmPipeline = ck_tile::BaseAQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
-
-    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
-    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    constexpr bool transposed_warp_gemm = false;
-
-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-
-        using CodegenPipelineProblem =
-            ck_tile::GemmAQuantPipelineProblem<ADataType,
-                                               AQDataType,
-                                               BDataType,
-                                               AccDataType,
-                                               CodegenGemmShape,
-                                               CodegenGemmTraits,
-                                               QuantGroupSize,
-                                               transposed_warp_gemm,
-                                               ComputeDataType,
-                                               ck_tile::GemmPipelineScheduler::Intrawave,
-                                               has_hot_loop_v,
-                                               tail_number_v>;
-        using CodegenGemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
-        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-                   ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                    BDataType,
-                                                    ck_tile::tuple<>,
-                                                    AccDataType,
-                                                    CDataType,
-                                                    ck_tile::tuple<>,
-                                                    CLayout,
-                                                    ck_tile::element_wise::PassThrough,
-                                                    TilePartitioner::MPerBlock,
-                                                    TilePartitioner::NPerBlock,
-                                                    M_Warp,
-                                                    N_Warp,
-                                                    M_Warp_Tile,
-                                                    N_Warp_Tile,
-                                                    K_Warp_Tile,
-                                                    transposed_warp_gemm,
-                                                    ck_tile::memory_operation_enum::set>>;
-        using Kernel =
-            ck_tile::AQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-
-        auto kargs = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(args.k_batch != 1)
-        {
-            throw std::runtime_error("split-k is not supported yet!");
-        }
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
-}
-
-#include "run_gemm_aquant_example.inc"
-
-template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
-                argc, argv, Row{}, Row{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for A.");
-    }
-
-    return 0;
-}
-
-template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    std::string data_type = arg_parser.get_str("prec");
-    std::string a_layout  = arg_parser.get_str("a_layout");
-    std::string b_layout  = arg_parser.get_str("b_layout");
-
-    if(data_type == "fp8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "bf8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4fp8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::fp8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::fp8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4bf8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::bf8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::bf8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for this operation !!!");
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    return !run_gemm_example<GemmConfigPreshuffleQuant>(argc, argv);
-}
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp
deleted file mode 100644
index 49e60bf86d..0000000000
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <stdexcept>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/core/config.hpp"
-#include "ck_tile/host.hpp"
-#include "gemm_utils.hpp"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename BQDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ComputeDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float gemm_calc_bquant(const ck_tile::BQuantGemmHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
-
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
-    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
-    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits = ck_tile::TileGemmBQuantTraits<kPadM,
-                                                            kPadN,
-                                                            kPadK,
-                                                            GemmConfig::PreshuffleQuant,
-                                                            ALayout,
-                                                            BLayout,
-                                                            CLayout>;
-
-    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
-                                                                 BDataType,
-                                                                 AccDataType,
-                                                                 CodegenGemmShape,
-                                                                 CodegenGemmTraits,
-                                                                 ComputeDataType>;
-
-    using BaseGemmPipeline = ck_tile::BaseBQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
-
-    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
-    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    constexpr bool transposed_warp_gemm = false;
-
-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-
-        using CodegenPipelineProblem =
-            ck_tile::GemmBQuantPipelineProblem<ADataType,
-                                               BDataType,
-                                               BQDataType,
-                                               AccDataType,
-                                               CodegenGemmShape,
-                                               CodegenGemmTraits,
-                                               QuantGroupSize,
-                                               ComputeDataType,
-                                               ck_tile::GemmPipelineScheduler::Intrawave,
-                                               has_hot_loop_v,
-                                               tail_number_v>;
-        using CodegenGemmPipeline = ck_tile::BQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
-        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-                   ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                    BDataType,
-                                                    ck_tile::tuple<>,
-                                                    AccDataType,
-                                                    CDataType,
-                                                    ck_tile::tuple<>,
-                                                    CLayout,
-                                                    ck_tile::element_wise::PassThrough,
-                                                    TilePartitioner::MPerBlock,
-                                                    TilePartitioner::NPerBlock,
-                                                    M_Warp,
-                                                    N_Warp,
-                                                    M_Warp_Tile,
-                                                    N_Warp_Tile,
-                                                    K_Warp_Tile,
-                                                    transposed_warp_gemm,
-                                                    ck_tile::memory_operation_enum::set>>;
-        using Kernel =
-            ck_tile::BQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-
-        auto kargs = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(args.k_batch != 1)
-        {
-            throw std::runtime_error("split-k is not supported yet!");
-        }
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
-    ;
-}
-
-#include "run_gemm_bquant_example.inc"
-
-template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if constexpr(std::is_same_v<typename TypeConfig::BDataType, ck_tile::pk_int4_t> ||
-                 std::is_same_v<typename TypeConfig::BDataType, ck_tile::fp8_t> ||
-                 std::is_same_v<typename TypeConfig::BDataType, ck_tile::bf8_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
-                argc, argv, Row{}, Col{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for B.");
-    }
-
-    return 0;
-}
-
-template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    std::string data_type = arg_parser.get_str("prec");
-    std::string a_layout  = arg_parser.get_str("a_layout");
-    std::string b_layout  = arg_parser.get_str("b_layout");
-
-    if(data_type == "fp8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "bf8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "fp8i4")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::fp8_t,
-                                                        ck_tile::pk_int4_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::fp8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "bf8i4")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t,
-                                                        ck_tile::pk_int4_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::bf8_t>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for this operation !!!");
-    }
-}
-
-int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigDecode>(argc, argv); }
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
new file mode 100644
index 0000000000..fa9ad967ad
--- /dev/null
+++ b/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_utils.hpp"
+
+template <typename GemmConfig,
+          typename TypeConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          uint32_t QuantGroupSize,
+          ck_tile::QuantType QuantMode,
+          typename CDEElementWise>
+float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::stream_config& s)
+{
+    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+    using ComputeDataType = std::conditional_t<QuantMode == ck_tile::QuantType::AQuantGrouped ||
+                                                   QuantMode == ck_tile::QuantType::RowColQuant,
+                                               typename TypeConfig::BDataType,
+                                               typename TypeConfig::ADataType>;
+
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile1DPartitioner<GemmShape>;
+
+    using GemmTraits = ck_tile::TileGemmQuantTraits<GemmConfig::kPadM,
+                                                    GemmConfig::kPadN,
+                                                    GemmConfig::kPadK,
+                                                    GemmConfig::PreshuffleQuant,
+                                                    GemmConfig::PreshuffleB,
+                                                    ALayout,
+                                                    BLayout,
+                                                    CLayout,
+                                                    QuantMode,
+                                                    ALayout, // for AQLayout
+                                                    BLayout, // for BQLayout
+                                                    GemmConfig::DoubleSmemBuffer>;
+
+    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<typename TypeConfig::ADataType,
+                                                                 typename TypeConfig::BDataType,
+                                                                 typename TypeConfig::AccDataType,
+                                                                 GemmShape,
+                                                                 GemmTraits,
+                                                                 ComputeDataType>;
+
+    using BaseGemmPipeline = std::conditional_t<
+        GemmConfig::PreshuffleB == true,
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<GemmPipelineProblem>,
+        ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>>;
+
+    const ck_tile::index_t K_split =
+        (args.K + GemmConfig::K_Tile - 1) / GemmConfig::K_Tile * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr bool transpose_c    = false;
+
+        // row-col and tensor quants use the regular pipeline, A/B quants use their own
+        using PipelineProblem = std::conditional_t<
+            QuantMode == ck_tile::QuantType::RowColQuant ||
+                QuantMode == ck_tile::QuantType::TensorQuant,
+            ck_tile::GemmRowColTensorQuantPipelineProblem<typename TypeConfig::ADataType,
+                                                          typename TypeConfig::BDataType,
+                                                          typename TypeConfig::AccDataType,
+                                                          typename TypeConfig::AccDataType,
+                                                          GemmShape,
+                                                          GemmTraits,
+                                                          transpose_c,
+                                                          ComputeDataType,
+                                                          GemmConfig::Scheduler,
+                                                          has_hot_loop_v,
+                                                          tail_number_v>,
+            std::conditional_t<QuantMode == ck_tile::QuantType::AQuantGrouped,
+                               ck_tile::GemmAQuantPipelineProblem<typename TypeConfig::ADataType,
+                                                                  typename TypeConfig::QDataType,
+                                                                  typename TypeConfig::BDataType,
+                                                                  typename TypeConfig::AccDataType,
+                                                                  GemmShape,
+                                                                  GemmTraits,
+                                                                  QuantGroupSize,
+                                                                  transpose_c,
+                                                                  ComputeDataType,
+                                                                  GemmConfig::Scheduler,
+                                                                  has_hot_loop_v,
+                                                                  tail_number_v>,
+                               ck_tile::GemmBQuantPipelineProblem<typename TypeConfig::ADataType,
+                                                                  typename TypeConfig::BDataType,
+                                                                  typename TypeConfig::QDataType,
+                                                                  typename TypeConfig::AccDataType,
+                                                                  GemmShape,
+                                                                  GemmTraits,
+                                                                  QuantGroupSize,
+                                                                  ComputeDataType,
+                                                                  GemmConfig::Scheduler,
+                                                                  has_hot_loop_v,
+                                                                  tail_number_v>>>;
+
+        using GemmPipeline = std::conditional_t<
+            QuantMode == ck_tile::QuantType::RowColQuant ||
+                QuantMode == ck_tile::QuantType::TensorQuant,
+            ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>,
+            std::conditional_t<
+                QuantMode == ck_tile::QuantType::AQuantGrouped,
+                ck_tile::AQuantGemmPipelineAgBgCrCompV3<PipelineProblem>,
+                std::conditional_t<GemmConfig::PreshuffleB == true,
+                                   ck_tile::WPQuantBPipelineAgBgCrV2<PipelineProblem>,
+                                   ck_tile::BQuantGemmPipelineAgBgCrCompV3<PipelineProblem>>>>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<typename TypeConfig::ADataType,
+                                             typename TypeConfig::BDataType,
+                                             ck_tile::tuple<>,
+                                             typename TypeConfig::AccDataType,
+                                             typename TypeConfig::CDataType,
+                                             ck_tile::tuple<>,
+                                             CLayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             transpose_c,
+                                             ck_tile::memory_operation_enum::set>>;
+        using Kernel =
+            ck_tile::QuantGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue, QuantMode>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(args.k_batch != 1)
+        {
+            throw std::runtime_error("split-k is not supported yet!");
+        }
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << PipelineProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+        float ave_time = 0;
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            ck_tile::HostTensor<typename TypeConfig::ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<typename TypeConfig::BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            ck_tile::RotatingMemWrapper<typename TypeConfig::ADataType,
+                                        typename TypeConfig::BDataType>
+                rotating_mem(
+                    kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(
+                        hipMemsetAsync(args.c_ptr,
+                                       0,
+                                       args.M * args.N * sizeof(typename TypeConfig::CDataType),
+                                       s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+
+        return ave_time;
+    };
+    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+}
+
+#include "run_gemm_quant_example.inc"
+
+template <typename GemmConfig,
+          typename TypeConfig,
+          uint32_t QuantGroupSize,
+          ck_tile::QuantType QuantMode>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if((QuantMode == ck_tile::QuantType::AQuantGrouped ||
+        QuantMode == ck_tile::QuantType::RowColQuant) &&
+       GemmConfig::PreshuffleB)
+    {
+        throw std::runtime_error(
+            "Preshuffling weight matrix is not supported for AQuant or RowColQuant");
+    }
+
+    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize, QuantMode>(
+                argc, argv, Row{}, Row{}, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for A.");
+    }
+
+    return 0;
+}
+
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    std::string quant_mode = arg_parser.get_str("quant_mode");
+
+    if(data_type == "fp8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
+
+        if(quant_mode == "aquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::AQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "bquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::BQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "rowcol")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::RowColQuant>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "tensor")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::TensorQuant>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode! Use 'aquant', 'bquant', 'tensor' or 'rowcol'");
+        }
+    }
+    else if(data_type == "bf8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
+
+        if(quant_mode == "aquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::AQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "bquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::BQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "rowcol")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::RowColQuant>(
+                a_layout, b_layout, argc, argv);
+        }
+        else if(quant_mode == "tensor")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::TensorQuant>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode! Use 'aquant', 'bquant', 'tensor' or 'rowcol'");
+        }
+    }
+    else if(data_type == "i4fp8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::fp8_t,
+                                                        ck_tile::half_t,
+                                                        ck_tile::fp8_t>{});
+
+        if(quant_mode == "aquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::AQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode for this datatype! Use 'aquant'.");
+        }
+    }
+    else if(data_type == "i4bf8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::bf8_t,
+                                                        ck_tile::half_t,
+                                                        ck_tile::bf8_t>{});
+
+        if(quant_mode == "aquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::AQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode for this datatype! Use 'aquant'.");
+        }
+    }
+    else if(data_type == "fp8i4")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::fp8_t,
+                                                        ck_tile::pk_int4_t,
+                                                        ck_tile::half_t,
+                                                        ck_tile::fp8_t>{});
+
+        if(quant_mode == "bquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::BQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode for this datatype! Use 'bquant'.");
+        }
+    }
+    else if(data_type == "bf8i4")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t,
+                                                        ck_tile::pk_int4_t,
+                                                        ck_tile::half_t,
+                                                        ck_tile::bf8_t>{});
+
+        if(quant_mode == "bquant")
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                              TypeConfig,
+                                              128,
+                                              ck_tile::QuantType::BQuantGrouped>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "Unsupported quantization mode for this datatype! Use 'bquant'.");
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    return !run_gemm_example<GemmConfigPreshuffleB_Bquant_decode>(argc, argv);
+}
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
index d64297cb35..cfe7b72af9 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -9,16 +9,12 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/ops/gemm_group_quant.hpp"
-
-#define CK_TILE_PIPELINE_PREFILL 1
-#define CK_TILE_PIPELINE_DECODE 2
-#define CK_TILE_PIPELINE_PRESHUFFLEQUANT 3
+#include "ck_tile/ops/gemm_quant.hpp"
 
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile()
 {
-#if defined(__gfx950__)
+#if defined(CK_GFX950_SUPPORT)
     constexpr bool is_8bit_float =
         std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
     if constexpr(M_Warp_Tile == 32)
@@ -35,7 +31,7 @@ constexpr ck_tile::index_t get_k_warp_tile()
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_from_preshuffled_warp_tile()
 {
-#if defined(__gfx950__)
+#if defined(CK_GFX950_SUPPORT)
     if constexpr(M_Warp_Tile == 32)
         return sizeof(PrecType) == 2 ? 16 : 64;
     else
@@ -48,6 +44,13 @@ constexpr ck_tile::index_t get_k_from_preshuffled_warp_tile()
 #endif
 }
 
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
@@ -81,16 +84,19 @@ struct GemmConfigBase
     static constexpr bool TransposeC            = false;
     static constexpr bool UseStructuredSparsity = false;
 
+    static constexpr int kBlockPerCu = 1;
+    static constexpr auto Scheduler  = ck_tile::GemmPipelineScheduler::Intrawave;
+
     static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
     static constexpr ck_tile::index_t TileParitionerM01      = 4;
-    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
 
     static constexpr bool PreshuffleQuant  = false;
+    static constexpr bool PreshuffleB      = false;
     static constexpr bool DoubleSmemBuffer = false;
 };
 
 template <typename PrecType>
-struct GemmConfigDecode : public GemmConfigBase
+struct GemmConfigQuant : public GemmConfigBase
 {
     static constexpr ck_tile::index_t M_Tile = 16;
     static constexpr ck_tile::index_t N_Tile = 64;
@@ -103,19 +109,14 @@ struct GemmConfigDecode : public GemmConfigBase
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
     static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr int kBlockPerCu = 1;
-
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_DECODE;
 };
 
 template <typename PrecType>
-struct GemmConfigPrefill : public GemmConfigBase
+struct GemmConfigRowColQuant : public GemmConfigBase
 {
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
 
     static constexpr ck_tile::index_t M_Warp = 1;
     static constexpr ck_tile::index_t N_Warp = 4;
@@ -124,10 +125,6 @@ struct GemmConfigPrefill : public GemmConfigBase
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
     static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr int kBlockPerCu           = 2;
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PREFILL;
 };
 
 template <typename PrecType>
@@ -146,11 +143,27 @@ struct GemmConfigPreshuffleQuant : public GemmConfigBase
     static constexpr ck_tile::index_t K_Warp_Tile =
         get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
 
-    static constexpr int kBlockPerCu = 1;
+    static constexpr bool PreshuffleQuant = true;
+};
 
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLEQUANT;
-    static constexpr bool PreshuffleQuant      = true;
+template <typename PrecType>
+struct GemmConfigPreshuffleB_Bquant_decode : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool PreshuffleB      = true;
+    static constexpr bool DoubleSmemBuffer = true;
 };
 
 template <typename ADataType_,
@@ -230,26 +243,26 @@ auto create_args(int argc, char* argv[])
         .insert("n", "4096", "n dimension")
         .insert("k", "2048", "k dimension")
         .insert("a_layout", "R", "A tensor data layout - Row by default")
-        .insert("aq_layout", "R", "Aq tensor data layout - Row by default")
         .insert("b_layout", "C", "B tensor data layout - Column by default")
+        .insert("bq_layout", "C", "Bq tensor data layout - Column by default")
         .insert("c_layout", "R", "C tensor data layout - Row by default")
         .insert("stride_a", "0", "Tensor A stride")
         .insert("stride_q", "0", "Tensor AQ stride")
         .insert("stride_b", "0", "Tensor B stride")
         .insert("stride_c", "0", "Tensor C stride")
-        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
-        .insert("prec", "i4fp8", "data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec",
+                "fp8",
+                "data type. For AQuant: fp8/bf8/i4fp8/i4bf8, For Bquant: fp8/bf8/fp8i4/bf8i4")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "1000", "number of iterations to benchmark the kernel")
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
         .insert("split_k", "1", "splitK value")
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
-        .insert("persistent", "0", "0:non-persistent, 1:persistent")
-        .insert("as_br_cr", "false", "Choose between as_br_cr and as_bs_cr");
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
+        .insert("rotating_count", "1000", "rotating count, defaults to 1")
+        .insert("quant_mode", "aquant", "Choose aquant (default), bquant, tensor or rowcol");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
-
-// host API
-float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
deleted file mode 100644
index 8cf77cb011..0000000000
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
+++ /dev/null
@@ -1,293 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-#include <bit>
-#include <random>
-#include <stdexcept>
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-template <typename T>
-auto shuffle_aq(const ck_tile::HostTensor<T>& t, int block_aq_k)
-{
-    if(t.get_lengths().size() != 2)
-    {
-        throw std::runtime_error("Host tensor is not rank 2 tensor.");
-    }
-    int m_   = t.get_lengths()[0];
-    int aqk_ = t.get_lengths()[1];
-    if(aqk_ % block_aq_k != 0)
-    {
-        throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
-    }
-    ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {1, 0, 2});
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename AQDataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename AQLayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                  ck_tile::DeviceMem& aq_m_aqk_dev_buf,
-                  ck_tile::DeviceMem& b_k_n_dev_buf,
-                  ck_tile::DeviceMem& c_m_n_dev_buf,
-                  ck_tile::index_t M,
-                  ck_tile::index_t N,
-                  ck_tile::index_t K,
-                  ck_tile::index_t AQK,
-                  ck_tile::index_t stride_A,
-                  ck_tile::index_t stride_AQ,
-                  ck_tile::index_t stride_B,
-                  ck_tile::index_t stride_C,
-                  ck_tile::index_t kbatch,
-                  int n_warmup,
-                  int n_repeat)
-{
-    ck_tile::AQuantGemmHostArgs args;
-    args.a_ptr     = a_m_k_dev_buf.GetDeviceBuffer();
-    args.aq_ptr    = aq_m_aqk_dev_buf.GetDeviceBuffer();
-    args.b_ptr     = b_k_n_dev_buf.GetDeviceBuffer();
-    args.c_ptr     = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch   = kbatch;
-    args.M         = M;
-    args.N         = N;
-    args.K         = K;
-    args.QK        = AQK;
-    args.stride_A  = stride_A;
-    args.stride_B  = stride_B;
-    args.stride_C  = stride_C;
-    args.stride_AQ = stride_AQ;
-
-    float ave_time = gemm_calc_aquant<GemmConfig,
-                                      ADataType,
-                                      AQDataType,
-                                      BDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      BDataType,
-                                      ALayout,
-                                      BLayout,
-                                      CLayout,
-                                      QuantGroupSize>(
-        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
-
-    std::size_t flop     = std::size_t(2) * M * N * K;
-    std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(AQDataType) * M * AQK +
-                           sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideAQ =" << stride_AQ << " StrideB =" << stride_B
-              << " StrideC =" << stride_C << " A_Layout =" << ALayout::name
-              << " B_Layout =" << BLayout::name << " C_Layout =" << CLayout::name
-              << " A_Type = " << DataTypeTraits<ADataType>::name
-              << " AQ_Type = " << DataTypeTraits<AQDataType>::name
-              << " B_Type = " << DataTypeTraits<BDataType>::name
-              << " Acc_Type = " << DataTypeTraits<AccDataType>::name
-              << " C_Type = " << DataTypeTraits<CDataType>::name << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename TypeConfig,
-          uint32_t QuantGroupSize,
-          typename ALayout,
-          typename AQLayout,
-          typename BLayout,
-          typename CLayout>
-int run_gemm_example_with_layouts(int argc,
-                                  char* argv[],
-                                  const ALayout a_layout                  = ALayout{},
-                                  const AQLayout aq_layout                = AQLayout{},
-                                  const BLayout b_layout                  = BLayout{},
-                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    using ADataType   = typename TypeConfig::ADataType;
-    using AQDataType  = typename TypeConfig::QDataType;
-    using BDataType   = typename TypeConfig::BDataType;
-    using AccDataType = typename TypeConfig::AccDataType;
-    using CDataType   = typename TypeConfig::CDataType;
-
-    ck_tile::index_t M = arg_parser.get_int("m");
-    ck_tile::index_t N = arg_parser.get_int("n");
-    ck_tile::index_t K = arg_parser.get_int("k");
-
-    if(K % QuantGroupSize != 0)
-    {
-        throw std::runtime_error("K must be aligned with QuantGroupSize");
-    }
-
-    ck_tile::index_t AQK = K / QuantGroupSize;
-
-    ck_tile::index_t stride_A  = arg_parser.get_int("stride_a");
-    ck_tile::index_t stride_AQ = arg_parser.get_int("stride_q");
-    ck_tile::index_t stride_B  = arg_parser.get_int("stride_b");
-    ck_tile::index_t stride_C  = arg_parser.get_int("stride_c");
-
-    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
-    int n_warmup                 = arg_parser.get_int("warmup");
-    int n_repeat                 = arg_parser.get_int("repeat");
-    ck_tile::index_t init_method = arg_parser.get_int("init");
-
-    stride_A  = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
-    stride_AQ = ck_tile::get_default_stride(M, AQK, stride_AQ, is_row_major(aq_layout));
-    stride_B  = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
-    stride_C  = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
-
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
-    ck_tile::HostTensor<AQDataType> aq_m_aqk(
-        ck_tile::host_tensor_descriptor(M, AQK, stride_AQ, is_row_major(aq_layout)));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<std::uint32_t> fill_seed(0, 500);
-
-    if(init_method == 0)
-    {
-        if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
-        {
-            ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
-                a_m_k);
-        }
-        else
-        {
-            ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
-        }
-        ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(aq_m_aqk);
-        ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-    }
-    else if(init_method == 1)
-    {
-        std::cout << "Monotonic initialization is not supported." << std::endl;
-        return 0;
-    }
-    else if(init_method == 2)
-    {
-        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x22)}(a_m_k);
-        ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(0.5f)}(aq_m_aqk);
-        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x38)}(b_k_n);
-    }
-    else
-    {
-        a_m_k.SetZero();
-        aq_m_aqk.SetZero();
-        b_k_n.SetZero();
-    }
-
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem aq_m_aqk_dev_buf(aq_m_aqk.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
-
-    if constexpr(GemmConfig::PreshuffleQuant)
-    {
-        ck_tile::HostTensor<AQDataType> aq_shuffle_host =
-            shuffle_aq(aq_m_aqk, GemmConfig::K_Tile / QuantGroupSize);
-        aq_m_aqk_dev_buf.ToDevice(aq_shuffle_host.data());
-    }
-    else
-    {
-        aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
-    }
-
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    b_k_n_dev_buf.ToDevice(b_k_n.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
-
-    invoke_gemm<GemmConfig,
-                ADataType,
-                AQDataType,
-                BDataType,
-                AccDataType,
-                CDataType,
-                ALayout,
-                AQLayout,
-                BLayout,
-                CLayout,
-                QuantGroupSize>(a_m_k_dev_buf,
-                                aq_m_aqk_dev_buf,
-                                b_k_n_dev_buf,
-                                c_m_n_dev_buf,
-                                M,
-                                N,
-                                K,
-                                AQK,
-                                stride_A,
-                                stride_AQ,
-                                stride_B,
-                                stride_C,
-                                kbatch,
-                                n_warmup,
-                                n_repeat);
-
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-    bool pass = true;
-
-    if(arg_parser.get_int("v") == 1)
-    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
-        ck_tile::reference_gemm_quant<ADataType,
-                                      AQDataType,
-                                      BDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      QuantGroupSize,
-                                      true>(a_m_k, aq_m_aqk, b_k_n, c_m_n_host_ref);
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        if(!pass)
-        {
-            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                      << std::endl;
-        }
-        std::cout << "CPU verification " << (pass ? "Passed!" : "Failed ...") << std::endl;
-    }
-    else if(arg_parser.get_int("v") == 2)
-    {
-        std::cout << "GPU verification is not implemented yet. Re-run with -v=1" << std::endl;
-        return false;
-    }
-
-    return pass;
-}
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_bquant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_bquant_example.inc
deleted file mode 100644
index e3e11bb0a9..0000000000
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_bquant_example.inc
+++ /dev/null
@@ -1,286 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-#include <bit>
-#include <random>
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-template <typename T>
-auto shuffle_bq(const ck_tile::HostTensor<T>& t, int block_bq_k)
-{
-    if(t.get_lengths().size() != 2)
-    {
-        throw std::runtime_error("Host tensor is not rank 2 tensor.");
-    }
-    int n_   = t.get_lengths()[0];
-    int bqk_ = t.get_lengths()[1];
-    if(bqk_ % block_bq_k != 0)
-    {
-        throw std::runtime_error("shuffle_aq needs a bqk of multiple times of block_bq_k.");
-    }
-    ck_tile::HostTensor<T> t_view({n_, bqk_ / block_bq_k, block_bq_k});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {1, 0, 2});
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename BQDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename BQLayout,
-          typename DsLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                  ck_tile::DeviceMem& b_k_n_dev_buf,
-                  ck_tile::DeviceMem& bq_bqk_n_dev_buf,
-                  ck_tile::DeviceMem& c_m_n_dev_buf,
-                  ck_tile::index_t M,
-                  ck_tile::index_t N,
-                  ck_tile::index_t K,
-                  ck_tile::index_t BQK,
-                  ck_tile::index_t stride_A,
-                  ck_tile::index_t stride_B,
-                  ck_tile::index_t stride_BQ,
-                  ck_tile::index_t stride_C,
-                  ck_tile::index_t kbatch,
-                  int n_warmup,
-                  int n_repeat)
-{
-    ck_tile::BQuantGemmHostArgs args;
-    args.a_ptr     = a_m_k_dev_buf.GetDeviceBuffer();
-    args.b_ptr     = b_k_n_dev_buf.GetDeviceBuffer();
-    args.bq_ptr    = bq_bqk_n_dev_buf.GetDeviceBuffer();
-    args.c_ptr     = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch   = kbatch;
-    args.M         = M;
-    args.N         = N;
-    args.K         = K;
-    args.QK        = BQK;
-    args.stride_A  = stride_A;
-    args.stride_B  = stride_B;
-    args.stride_C  = stride_C;
-    args.stride_BQ = stride_BQ;
-
-    float ave_time = gemm_calc_bquant<GemmConfig,
-                                      ADataType,
-                                      BDataType,
-                                      BQDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      ADataType, // computeDatatype
-                                      ALayout,
-                                      BLayout,
-                                      CLayout,
-                                      QuantGroupSize>(
-        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
-
-    std::size_t flop     = std::size_t(2) * M * N * K;
-    std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * N * K +
-                           sizeof(BQDataType) * BQK * N + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideBQ =" << stride_BQ
-              << " StrideC =" << stride_C << " A_Layout =" << ALayout::name
-              << " B_Layout =" << BLayout::name << " C_Layout =" << CLayout::name
-              << " A_Type = " << DataTypeTraits<ADataType>::name
-              << " B_Type = " << DataTypeTraits<BDataType>::name
-              << " BQ_Type = " << DataTypeTraits<BQDataType>::name
-              << " Acc_Type = " << DataTypeTraits<AccDataType>::name
-              << " C_Type = " << DataTypeTraits<CDataType>::name << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename TypeConfig,
-          uint32_t QuantGroupSize,
-          typename ALayout,
-          typename BLayout,
-          typename BQLayout,
-          typename CLayout>
-int run_gemm_example_with_layouts(int argc,
-                                  char* argv[],
-                                  const ALayout a_layout                  = ALayout{},
-                                  const BLayout b_layout                  = BLayout{},
-                                  const BQLayout bq_layout                = BQLayout{},
-                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    using ADataType   = typename TypeConfig::ADataType;
-    using BDataType   = typename TypeConfig::BDataType;
-    using BQDataType  = typename TypeConfig::QDataType;
-    using AccDataType = typename TypeConfig::AccDataType;
-    using CDataType   = typename TypeConfig::CDataType;
-
-    ck_tile::index_t M = arg_parser.get_int("m");
-    ck_tile::index_t N = arg_parser.get_int("n");
-    ck_tile::index_t K = arg_parser.get_int("k");
-
-    if(K % QuantGroupSize != 0)
-    {
-        throw std::runtime_error("K must be aligned with QuantGroupSize");
-    }
-
-    ck_tile::index_t BQK = K / QuantGroupSize;
-
-    ck_tile::index_t stride_A  = arg_parser.get_int("stride_a");
-    ck_tile::index_t stride_B  = arg_parser.get_int("stride_b");
-    ck_tile::index_t stride_BQ = arg_parser.get_int("stride_q");
-    ck_tile::index_t stride_C  = arg_parser.get_int("stride_c");
-
-    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
-    int n_warmup                 = arg_parser.get_int("warmup");
-    int n_repeat                 = arg_parser.get_int("repeat");
-    ck_tile::index_t init_method = arg_parser.get_int("init");
-
-    stride_A  = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
-    stride_B  = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
-    stride_BQ = ck_tile::get_default_stride(BQK, N, stride_BQ, is_row_major(bq_layout));
-    stride_C  = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
-
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
-    ck_tile::HostTensor<BQDataType> bq_bqk_n(
-        ck_tile::host_tensor_descriptor(BQK, N, stride_BQ, is_row_major(bq_layout)));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<std::uint32_t> fill_seed(0, 500);
-
-    if(init_method == 0)
-    {
-        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-        {
-            ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
-                b_k_n);
-        }
-        else
-        {
-            ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-        }
-        ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(bq_bqk_n);
-        ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
-    }
-    else if(init_method == 1)
-    {
-        std::cout << "Monotonic initialization is not supported." << std::endl;
-        return 0;
-    }
-    else if(init_method == 2)
-    {
-        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x38)}(a_m_k);
-        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x22)}(b_k_n);
-        ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(bq_bqk_n);
-    }
-    else
-    {
-        a_m_k.SetZero();
-        b_k_n.SetZero();
-        bq_bqk_n.SetZero();
-    }
-
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem bq_bqk_n_dev_buf(bq_bqk_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
-
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    b_k_n_dev_buf.ToDevice(b_k_n.data());
-    bq_bqk_n_dev_buf.ToDevice(bq_bqk_n.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
-
-    invoke_gemm<GemmConfig,
-                ADataType,
-                BDataType,
-                BQDataType,
-                ck_tile::tuple<>,
-                AccDataType,
-                CDataType,
-                ALayout,
-                BLayout,
-                BQLayout,
-                ck_tile::tuple<>,
-                CLayout,
-                QuantGroupSize>(a_m_k_dev_buf,
-                                b_k_n_dev_buf,
-                                bq_bqk_n_dev_buf,
-                                c_m_n_dev_buf,
-                                M,
-                                N,
-                                K,
-                                BQK,
-                                stride_A,
-                                stride_B,
-                                stride_BQ,
-                                stride_C,
-                                kbatch,
-                                n_warmup,
-                                n_repeat);
-
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-    bool pass = true;
-
-    if(arg_parser.get_int("v") == 1)
-    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
-        ck_tile::reference_gemm_quant<ADataType,
-                                      BQDataType,
-                                      BDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      QuantGroupSize,
-                                      false>(a_m_k, bq_bqk_n, b_k_n, c_m_n_host_ref);
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        if(!pass)
-        {
-            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                      << std::endl;
-        }
-        std::cout << "CPU verification " << (pass ? "Passed!" : "Failed ...") << std::endl;
-    }
-    else if(arg_parser.get_int("v") == 2)
-    {
-        std::cout << "GPU verification is not implemented yet. Re-run with -v=1" << std::endl;
-        return false;
-    }
-
-    return pass;
-}
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
new file mode 100644
index 0000000000..e68eb23641
--- /dev/null
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <random>
+#include <stdexcept>
+#include "ck_tile/host/permute_pk_int4.hpp"
+
+template <typename T>
+auto shuffle_aq(const ck_tile::HostTensor<T>* t, int block_aq_k)
+{
+    if(t->get_lengths().size() != 2)
+    {
+        throw std::runtime_error("Host tensor is not rank 2 tensor.");
+    }
+    int m_   = t->get_lengths()[0];
+    int aqk_ = t->get_lengths()[1];
+    if(aqk_ % block_aq_k != 0)
+    {
+        throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
+    }
+    ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
+    std::copy(t->begin(), t->end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {1, 0, 2});
+}
+
+template <typename GemmConfig, typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_                = t.get_lengths()[1];
+    int k_                = t.get_lengths()[0];
+    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                   GemmConfig::N_Warp_Tile,
+                                   k_ / GemmConfig::K_Warp_Tile,
+                                   divisor,
+                                   GemmConfig::K_Warp_Tile / divisor});
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+}
+
+template <typename GemmConfig,
+          typename TypeConfig,
+          typename ALayout,
+          typename AQLayout,
+          typename BLayout,
+          typename BQLayout,
+          typename CLayout,
+          uint32_t QuantGroupSize,
+          ck_tile::QuantType QuantMode,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                  ck_tile::DeviceMem* aq_dev_buf,
+                  ck_tile::DeviceMem& b_k_n_dev_buf,
+                  ck_tile::DeviceMem* bq_dev_buf,
+                  ck_tile::DeviceMem& c_m_n_dev_buf,
+                  ck_tile::index_t M,
+                  ck_tile::index_t N,
+                  ck_tile::index_t K,
+                  ck_tile::index_t AQK,
+                  ck_tile::index_t BQK,
+                  ck_tile::index_t stride_A,
+                  ck_tile::index_t stride_AQ,
+                  ck_tile::index_t stride_B,
+                  ck_tile::index_t stride_BQ,
+                  ck_tile::index_t stride_C,
+                  ck_tile::index_t kbatch,
+                  int n_warmup,
+                  int n_repeat,
+                  bool flush_cache,
+                  int rotating_count)
+{
+    ck_tile::QuantGemmHostArgs args;
+    args.a_ptr     = a_m_k_dev_buf.GetDeviceBuffer();
+    args.aq_ptr    = (aq_dev_buf != nullptr) ? aq_dev_buf->GetDeviceBuffer() : nullptr;
+    args.b_ptr     = b_k_n_dev_buf.GetDeviceBuffer();
+    args.bq_ptr    = (bq_dev_buf != nullptr) ? bq_dev_buf->GetDeviceBuffer() : nullptr;
+    args.c_ptr     = c_m_n_dev_buf.GetDeviceBuffer();
+    args.k_batch   = kbatch;
+    args.M         = M;
+    args.N         = N;
+    args.K         = K;
+    args.QK_A      = AQK;
+    args.QK_B      = BQK;
+    args.stride_A  = stride_A;
+    args.stride_B  = stride_B;
+    args.stride_C  = stride_C;
+    args.stride_AQ = stride_AQ;
+    args.stride_BQ = stride_BQ;
+
+    float ave_time = gemm_calc_quant<GemmConfig,
+                                     TypeConfig,
+                                     ALayout,
+                                     BLayout,
+                                     CLayout,
+                                     QuantGroupSize,
+                                     QuantMode,
+                                     CDEElementWise>(
+        args,
+        ck_tile::stream_config{
+            nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
+
+    std::size_t flop     = std::size_t(2) * M * N * K;
+    std::size_t num_byte = sizeof(typename TypeConfig::ADataType) * M * K +
+                           sizeof(typename TypeConfig::BDataType) * N * K +
+                           sizeof(typename TypeConfig::CDataType) * M * N;
+    if(aq_dev_buf != nullptr)
+    {
+        num_byte += sizeof(typename TypeConfig::QDataType) * M * AQK;
+    }
+    if(bq_dev_buf != nullptr)
+    {
+        num_byte += sizeof(typename TypeConfig::QDataType) * N * BQK;
+    }
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideAQ =" << stride_AQ << " StrideB =" << stride_B
+              << " StrideC =" << stride_C << " A_Layout =" << ALayout::name
+              << " B_Layout =" << BLayout::name << " C_Layout =" << CLayout::name;
+    if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant)
+    {
+        std::cout << " StrideBQ =" << stride_BQ;
+    }
+    std::cout << " A_Type = " << DataTypeTraits<typename TypeConfig::ADataType>::name
+              << " AQ_Type = " << DataTypeTraits<typename TypeConfig::QDataType>::name
+              << " B_Type = " << DataTypeTraits<typename TypeConfig::BDataType>::name;
+    if constexpr(!std::is_same_v<typename TypeConfig::QDataType, void>)
+    {
+        std::cout << " BQ_Type = " << DataTypeTraits<typename TypeConfig::QDataType>::name;
+    }
+    std::cout << " Acc_Type = " << DataTypeTraits<typename TypeConfig::AccDataType>::name
+              << " C_Type = " << DataTypeTraits<typename TypeConfig::CDataType>::name
+              << " QuantMode = " << quant_type_to_string(QuantMode)
+              << " PreshuffleQuant = " << (GemmConfig::PreshuffleQuant ? "true" : "false") << " : "
+              << " PreshuffleB = " << (GemmConfig::PreshuffleB ? "true" : "false") << " : "
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename TypeConfig,
+          uint32_t QuantGroupSize,
+          ck_tile::QuantType QuantMode,
+          typename ALayout,
+          typename AQLayout,
+          typename BLayout,
+          typename BQLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts(int argc,
+                                  char* argv[],
+                                  const ALayout a_layout                  = ALayout{},
+                                  const AQLayout aq_layout                = AQLayout{},
+                                  const BLayout b_layout                  = BLayout{},
+                                  const BQLayout bq_layout                = BQLayout{},
+                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using ADataType   = typename TypeConfig::ADataType;
+    using AQDataType  = typename TypeConfig::QDataType;
+    using BDataType   = typename TypeConfig::BDataType;
+    using BQDataType  = typename TypeConfig::QDataType;
+    using AccDataType = typename TypeConfig::AccDataType;
+    using CDataType   = typename TypeConfig::CDataType;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::BQuantGrouped)
+    {
+        if(K % QuantGroupSize != 0)
+        {
+            throw std::runtime_error(
+                "K must be aligned with QuantGroupSize for AQuantGrouped mode");
+        }
+    }
+    ck_tile::index_t AQK, BQK;
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
+    {
+        AQK = K / QuantGroupSize; // Group quantization: AQK = K / GroupSize
+        BQK = 0;                  // No B quantization
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
+    {
+        AQK = 0;                  // No A quantization
+        BQK = K / QuantGroupSize; // Group quantization: BQK = K / GroupSize
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::RowColQuant ||
+                      QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        AQK = 1; // Row quantization: tensor shape [M, 1] or [1]
+        BQK = 1; // Column quantization: tensor shape [1, N] or [1]
+    }
+    else
+    {
+        static_assert(false, "Unsupported QuantMode");
+    }
+
+    ck_tile::index_t stride_A  = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_AQ = arg_parser.get_int("stride_q");
+    ck_tile::index_t stride_B  = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C  = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t stride_BQ = arg_parser.get_int("stride_q");
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+    int rotating_count           = arg_parser.get_int("rotating_count");
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    // Conditional stride calculation based on QuantMode
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
+    {
+        stride_AQ = ck_tile::get_default_stride(M, AQK, stride_AQ, is_row_major(aq_layout));
+        stride_BQ = 0; // No B quantization
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
+    {
+        stride_AQ = 0; // No A quantization
+        stride_BQ = ck_tile::get_default_stride(BQK, N, stride_BQ, is_row_major(bq_layout));
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
+    {
+        stride_AQ = ck_tile::get_default_stride(M, 1, stride_AQ, is_row_major(aq_layout));
+        stride_BQ = ck_tile::get_default_stride(1, N, stride_BQ, is_row_major(bq_layout));
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        stride_AQ = 1; // Tensor quantization: tensor shape [1]
+        stride_BQ = 1; // Tensor quantization: tensor shape [1]
+    }
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    // Create AQ tensor with appropriate shape
+    std::unique_ptr<ck_tile::HostTensor<AQDataType>> aq_tensor_ptr = nullptr;
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant)
+    {
+        aq_tensor_ptr = std::make_unique<ck_tile::HostTensor<AQDataType>>(
+            ck_tile::host_tensor_descriptor(M, AQK, stride_AQ, is_row_major(aq_layout)));
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        aq_tensor_ptr = std::make_unique<ck_tile::HostTensor<AQDataType>>(
+            ck_tile::host_tensor_descriptor(1, 1, stride_AQ, is_row_major(aq_layout)));
+    }
+
+    // Create BQ tensor with appropriate shape
+    std::unique_ptr<ck_tile::HostTensor<BQDataType>> bq_tensor_ptr = nullptr;
+    if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant)
+    {
+        bq_tensor_ptr = std::make_unique<ck_tile::HostTensor<BQDataType>>(
+            ck_tile::host_tensor_descriptor(BQK, N, stride_BQ, is_row_major(bq_layout)));
+    }
+    else if constexpr(QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        bq_tensor_ptr = std::make_unique<ck_tile::HostTensor<BQDataType>>(
+            ck_tile::host_tensor_descriptor(1, 1, stride_BQ, is_row_major(bq_layout)));
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<std::uint32_t> fill_seed(0, 500);
+
+    if(init_method == 0)
+    {
+        if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
+        {
+            if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+            {
+                ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
+                    b_k_n);
+            }
+            else
+            {
+                ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
+            }
+            ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                *bq_tensor_ptr);
+            ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
+        }
+        else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
+        {
+            if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
+            {
+                ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
+                    a_m_k);
+            }
+            else
+            {
+                ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
+            }
+            ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                *aq_tensor_ptr);
+            ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
+        }
+        else
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-2.0f, 2.0f, fill_seed(gen)}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-2.0f, 2.0f, fill_seed(gen)}(b_k_n);
+            ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                *aq_tensor_ptr);
+            ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                *bq_tensor_ptr);
+        }
+    }
+    else if(init_method == 1)
+    {
+        std::cout << "Monotonic initialization is not supported." << std::endl;
+        return 0;
+    }
+    else if(init_method == 2)
+    {
+        if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
+        {
+            ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x38)}(a_m_k);
+            ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x22)}(b_k_n);
+            ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(*bq_tensor_ptr);
+        }
+        else
+        {
+            ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x22)}(a_m_k);
+            ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(0.5f)}(*aq_tensor_ptr);
+            ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x38)}(b_k_n);
+
+            if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
+            {
+                ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(*bq_tensor_ptr);
+            }
+        }
+    }
+    else
+    {
+        a_m_k.SetZero();
+        aq_tensor_ptr->SetZero();
+        b_k_n.SetZero();
+        bq_tensor_ptr->SetZero();
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    std::unique_ptr<ck_tile::DeviceMem> aq_dev_buf_ptr = nullptr;
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant ||
+                 QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        aq_dev_buf_ptr =
+            std::make_unique<ck_tile::DeviceMem>(aq_tensor_ptr->get_element_space_size_in_bytes());
+    }
+
+    std::unique_ptr<ck_tile::DeviceMem> bq_dev_buf_ptr = nullptr;
+    if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant ||
+                 QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        bq_dev_buf_ptr =
+            std::make_unique<ck_tile::DeviceMem>(bq_tensor_ptr->get_element_space_size_in_bytes());
+    }
+
+    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant ||
+                 QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        if constexpr(GemmConfig::PreshuffleQuant)
+        {
+            ck_tile::HostTensor<AQDataType> aq_shuffle_host =
+                shuffle_aq(aq_tensor_ptr.get(), GemmConfig::K_Tile / QuantGroupSize);
+            aq_dev_buf_ptr->ToDevice(aq_shuffle_host.data());
+        }
+        else
+        {
+            aq_dev_buf_ptr->ToDevice(aq_tensor_ptr->data());
+        }
+    }
+
+    if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
+    {
+        // Permute vector pk_i4x4 data for device implementation
+        ck_tile::HostTensor<ADataType> a_m_k_dev = a_m_k;
+        ck_tile::permute_vectors_i4x4_b(a_m_k_dev);
+        a_m_k_dev_buf.ToDevice(a_m_k_dev.data());
+    }
+    else
+    {
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+    }
+
+    ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+    {
+
+        if constexpr(GemmConfig::PreshuffleB)
+        {
+            b_k_n_dev = shuffle_b<GemmConfig>(b_k_n);
+        }
+        ck_tile::permute_vectors_i4x4_b(b_k_n_dev);
+        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+    }
+    else
+    {
+        if constexpr(GemmConfig::PreshuffleB)
+        {
+            b_k_n_dev = shuffle_b<GemmConfig>(b_k_n);
+        }
+        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+    }
+
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped ||
+                 QuantMode == ck_tile::QuantType::RowColQuant ||
+                 QuantMode == ck_tile::QuantType::TensorQuant)
+    {
+        bq_dev_buf_ptr->ToDevice(bq_tensor_ptr->data());
+    }
+
+    invoke_gemm<GemmConfig,
+                TypeConfig,
+                ALayout,
+                AQLayout,
+                BLayout,
+                BQLayout,
+                CLayout,
+                QuantGroupSize,
+                QuantMode>(a_m_k_dev_buf,
+                           aq_dev_buf_ptr.get(),
+                           b_k_n_dev_buf,
+                           bq_dev_buf_ptr.get(),
+                           c_m_n_dev_buf,
+                           M,
+                           N,
+                           K,
+                           AQK,
+                           BQK,
+                           stride_A,
+                           stride_AQ,
+                           stride_B,
+                           stride_BQ,
+                           stride_C,
+                           kbatch,
+                           n_warmup,
+                           n_repeat,
+                           flush_cache,
+                           rotating_count);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
+        {
+            ck_tile::reference_gemm_quant<ADataType,
+                                          AQDataType,
+                                          BDataType,
+                                          AccDataType,
+                                          CDataType,
+                                          QuantGroupSize,
+                                          true>(a_m_k, *aq_tensor_ptr, b_k_n, c_m_n_host_ref);
+        }
+        else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
+        {
+            ck_tile::reference_gemm_quant<ADataType,
+                                          AQDataType,
+                                          BDataType,
+                                          AccDataType,
+                                          CDataType,
+                                          QuantGroupSize,
+                                          false>(a_m_k, *bq_tensor_ptr, b_k_n, c_m_n_host_ref);
+        }
+        else if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
+        {
+            ck_tile::reference_gemm_rowcol_quant<ADataType,
+                                                 AQDataType,
+                                                 BDataType,
+                                                 BQDataType,
+                                                 AccDataType,
+                                                 CDataType>(
+                a_m_k, *aq_tensor_ptr, b_k_n, *bq_tensor_ptr, c_m_n_host_ref);
+        }
+        else if constexpr(QuantMode == ck_tile::QuantType::TensorQuant)
+        {
+            ck_tile::reference_gemm_tensor_quant<ADataType,
+                                                 AQDataType,
+                                                 BDataType,
+                                                 BQDataType,
+                                                 AccDataType,
+                                                 CDataType>(
+                a_m_k, *aq_tensor_ptr, b_k_n, *bq_tensor_ptr, c_m_n_host_ref);
+        }
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        if(!pass)
+        {
+            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        std::cout << "GPU verification is not implemented yet. Re-run with -v=1" << std::endl;
+        return false;
+    }
+
+    return pass;
+}
diff --git a/example/ck_tile/39_copy/test_tile_example.sh b/example/ck_tile/39_copy/test_tile_example.sh
index fcd8c8e991..416338fac4 100755
--- a/example/ck_tile/39_copy/test_tile_example.sh
+++ b/example/ck_tile/39_copy/test_tile_example.sh
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 set -euo pipefail
 
 BIN="${BIN:-../../../build/bin/tile_example_copy}"
diff --git a/example/ck_tile/40_streamk_gemm/CMakeLists.txt b/example/ck_tile/40_streamk_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..3539dee05b
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(GPU_TARGETS MATCHES "gfx9")
+    add_executable(tile_example_streamk_gemm_basic EXCLUDE_FROM_ALL streamk_gemm_basic.cpp)
+else()
+    message(DEBUG "Skipping ck_tile streamk gemm tests for current target")
+endif()
diff --git a/example/ck_tile/40_streamk_gemm/README.md b/example/ck_tile/40_streamk_gemm/README.md
new file mode 100644
index 0000000000..d2ff7eabc0
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/README.md
@@ -0,0 +1,37 @@
+# Stream-K GEMM
+
+This folder contains examples of Stream-K GEMMs using the ck_tile tile-programming implementation.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx942) or leave it blank
+../script/cmake-ck-dev.sh  ../ <arch>
+# Compile the Stream-K kernels
+make tile_example_streamk_gemm_basic -j
+```
+This will result in an executable `build/bin/tile_example_streamk_gemm_basic`
+
+## example
+```
+args:
+                 -m    m dimension (default:512)
+                 -n    n dimension (default:512)
+                 -k    k dimension (default:512)
+          -a_layout    tensor A data layout (default: R)
+          -b_layout    tensor B data layout (default: C)
+          -c_layout    tensor C data layout (default: R)
+     -num_sk_blocks    number of Stream-K blocks. -1: chosen by algorithm, or user selected (default:-1)
+-reduction_strategy    strategy for storing results in C tensor. atomic/reduction (default:atomic)
+          -stride_a    tensor A stride (default:0)
+          -stride_b    tensor B stride (default:0)
+          -stride_c    tensor C stride (default:0)
+                 -v    validation strategy. 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:1)
+              -prec    data type. fp16/bf16 (default:fp16)
+            -warmup    number of iterations before benchmarking the kernel (default:50)
+            -repeat    number of iterations to benchmark the kernel (default:100)
+             -timer    timing mode. gpu:gpu timer, cpu:cpu timer (default:gpu)
+              -init    data initialization strategy. 0:random, 1:linear, 2:constant(1) (default:0)
+       -flush_cache    flush the cache before running the kernel (default:true)
+```
\ No newline at end of file
diff --git a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
new file mode 100644
index 0000000000..e698539eea
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
@@ -0,0 +1,106 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = true;
+    static constexpr bool kPadN = true;
+    static constexpr bool kPadK = true;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+    static constexpr bool Persistent            = false;
+
+    static constexpr int kBlockPerCu                = 1;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+    static constexpr bool DoubleSmemBuffer          = false;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryInterwave : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
+
+    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <typename ADataType_, typename BDataType_ = ADataType_, typename CDataType_ = ADataType_>
+struct StreamKGemmTypeConfig
+{
+    using ADataType   = ADataType_;
+    using BDataType   = BDataType_;
+    using AccDataType = float;
+    using CDataType   = CDataType_;
+};
+
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "512", "m dimension")
+        .insert("n", "512", "n dimension")
+        .insert("k", "512", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "C", "B tensor data layout - Column by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("num_sk_blocks",
+                "-1",
+                "number of Stream-K blocks. -1: chosen by algorithm, or user selected")
+        .insert("reduction_strategy",
+                "atomic",
+                "strategy for storing results in C tensor - atomic/reduction")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16")
+        .insert("warmup", "50", "number of iterations before benchmarking the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
new file mode 100644
index 0000000000..5fdf6b29ef
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
@@ -0,0 +1,351 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+#pragma once
+
+// Estimate the number of WGs contributing to the same macro tile in C
+template <ck_tile::StreamKReductionStrategy ReductionStrategy, typename TilePartitioner>
+int estimate_num_wgs_per_tile(const TilePartitioner& tile_partitioner)
+{
+    // In the case of non-atomic reduction or DP only, there will always be 1 WG contributing to a
+    // macro time in C
+    int num_wgs_per_tile = 1;
+
+    // Otherwise, for atomics, multiple WGs may be contributing to the same macro tile in C
+    if(tile_partitioner.sk_num_blocks > 0 &&
+       ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
+    {
+        // Determine the number of iterations per WG for a given macro tile in C
+        uint32_t k_iters_per_block = tile_partitioner.k_iters_per_big_block - 1;
+
+        // Estimate the number of WGs per macro tile
+        num_wgs_per_tile = (tile_partitioner.k_iters_per_tile.get() / (k_iters_per_block)) +
+                           ((tile_partitioner.k_iters_per_tile.get() % k_iters_per_block) != 0);
+    }
+
+    return std::max(num_wgs_per_tile, 1);
+}
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout)
+{
+    return ck_tile::bool_constant<
+        std::is_same_v<ck_tile::remove_cvref_t<Layout>, ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to multiple WGs working in the same C macro tile
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough,
+          ck_tile::StreamKReductionStrategy ReductionStrategy>
+std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s);
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+std::tuple<float, int> invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                                   ck_tile::DeviceMem& b_k_n_dev_buf,
+                                   ck_tile::DeviceMem& c_m_n_dev_buf,
+                                   ck_tile::index_t M,
+                                   ck_tile::index_t N,
+                                   ck_tile::index_t K,
+                                   ck_tile::index_t stride_A,
+                                   ck_tile::index_t stride_B,
+                                   ck_tile::index_t stride_C,
+                                   int n_warmup,
+                                   int n_repeat,
+                                   bool flush_cache,
+                                   ck_tile::StreamKReductionStrategy reduction_strategy,
+                                   uint32_t num_sk_blocks)
+{
+    ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
+                                  b_k_n_dev_buf.GetDeviceBuffer(),
+                                  c_m_n_dev_buf.GetDeviceBuffer(),
+                                  M,
+                                  N,
+                                  K,
+                                  stride_A,
+                                  stride_B,
+                                  stride_C,
+                                  reduction_strategy,
+                                  num_sk_blocks};
+
+    std::tuple<float, int> ave_time_and_batch;
+
+    if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
+    {
+        ave_time_and_batch = gemm<GemmConfig,
+                                  ADataType,
+                                  BDataType,
+                                  DsDataType,
+                                  AccDataType,
+                                  CDataType,
+                                  ALayout,
+                                  BLayout,
+                                  DsLayout,
+                                  CLayout,
+                                  CDEElementWise,
+                                  ck_tile::StreamKReductionStrategy::Atomic>(
+            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache});
+    }
+    else /*Reduction*/
+    {
+        ave_time_and_batch = gemm<GemmConfig,
+                                  ADataType,
+                                  BDataType,
+                                  DsDataType,
+                                  AccDataType,
+                                  CDataType,
+                                  ALayout,
+                                  BLayout,
+                                  DsLayout,
+                                  CLayout,
+                                  CDEElementWise,
+                                  ck_tile::StreamKReductionStrategy::Reduction>(
+            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache});
+    }
+
+    return ave_time_and_batch;
+}
+
+template <typename CDataType>
+bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+               const ck_tile::HostTensor<CDataType>& c_m_n_ref,
+               const ck_tile::tuple<double, double>& rtol_atol,
+               const char* variant)
+{
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_ref,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+    std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+              << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The " << variant << " verification result is:" << (pass ? "correct" : "fail")
+              << std::endl;
+    return pass;
+}
+
+ck_tile::StreamKReductionStrategy get_reduction_strategy_value(const std::string& strategy)
+{
+    if(strategy == "atomic")
+    {
+        return ck_tile::StreamKReductionStrategy::Atomic;
+    }
+    else if(strategy == "reduction")
+    {
+        return ck_tile::StreamKReductionStrategy::Reduction;
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported Stream-K reduction strategy !!!");
+    }
+}
+
+template <typename GemmConfig,
+          typename TypeConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts(int argc,
+                                  char* argv[],
+                                  const ALayout a_layout                  = ALayout{},
+                                  const BLayout b_layout                  = BLayout{},
+                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    static_assert(!GemmConfig::Preshuffle, "Not implemented");
+    static_assert(!GemmConfig::UseStructuredSparsity, "Not implemented");
+    static_assert(!GemmConfig::PermuteA, "Not implemented");
+    static_assert(!GemmConfig::PermuteB, "Not implemented");
+
+    using ADataType   = typename TypeConfig::ADataType;
+    using BDataType   = typename TypeConfig::BDataType;
+    using AccDataType = typename TypeConfig::AccDataType;
+    using CDataType   = typename TypeConfig::CDataType;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+
+    ck_tile::StreamKReductionStrategy reduction_strategy =
+        get_reduction_strategy_value(arg_parser.get_str("reduction_strategy"));
+    uint32_t num_sk_blocks = static_cast<uint32_t>(arg_parser.get_int("num_sk_blocks"));
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+    }
+    else
+    {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    auto [ave_time, num_wgs_per_tile] = invoke_gemm<GemmConfig,
+                                                    ADataType,
+                                                    BDataType,
+                                                    ck_tile::tuple<>,
+                                                    AccDataType,
+                                                    CDataType,
+                                                    ALayout,
+                                                    BLayout,
+                                                    ck_tile::tuple<>,
+                                                    CLayout>(a_m_k_dev_buf,
+                                                             b_k_n_dev_buf,
+                                                             c_m_n_dev_buf,
+                                                             M,
+                                                             N,
+                                                             K,
+                                                             stride_A,
+                                                             stride_B,
+                                                             stride_C,
+                                                             n_warmup,
+                                                             n_repeat,
+                                                             flush_cache,
+                                                             reduction_strategy,
+                                                             num_sk_blocks);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " A_Layout=" << ALayout::name << " B_Layout=" << BLayout::name
+              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
+              << " B_Type=" << DataTypeTraits<BDataType>::name
+              << " C_Type=" << DataTypeTraits<CDataType>::name
+              << " reduction_strategy=" << arg_parser.get_str("reduction_strategy") << " "
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    bool pass = true;
+
+    // Memory on host to store gpu reference result
+    ck_tile::HostTensor<CDataType> c_m_n_ref(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    c_m_n_ref.SetZero();
+
+    if(arg_parser.get_int("v") == 1) // Validate on the CPU
+    {
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, num_wgs_per_tile, max_accumulated_value);
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
+    }
+    else if(arg_parser.get_int("v") == 2) // Validate on the GPU
+    {
+        // Memory on device to store gpu reference result
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, num_wgs_per_tile, max_accumulated_value);
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
+    }
+
+    return pass;
+}
diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
new file mode 100644
index 0000000000..bb6b1eb413
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
@@ -0,0 +1,193 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise,
+          ck_tile::StreamKReductionStrategy ReductionStrategy>
+std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s)
+
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    using TilePartitioner = ck_tile::StreamKTilePartitioner<GemmShape, ReductionStrategy>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 GemmConfig::Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+
+    const auto Run = [&](const auto memory_operation) -> std::tuple<float, int> {
+        // We create the GEMM pipeline without specifying has_hot_loop or tail_num.
+        // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K
+        // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K
+        // Kernel's RunGemm function. This is a similar pattern used by grouped GEMM.
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           GemmConfig::Scheduler>;
+
+        using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation.value,
+                                             GemmConfig::NumWaveGroups>>;
+
+        using Kernel = ck_tile::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        dim3 grids  = Kernel::GridSize(kargs.tile_partitioner);
+        dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        // Function to clear the output C tensor results after each repetition of the kernel
+        auto clear_gemm_output = [&]() {
+            if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
+                hipGetErrorString(hipMemsetAsync(
+                    args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+        };
+
+        std::function<void()> preprocess = clear_gemm_output;
+
+        float ave_time = ck_tile::launch_kernel_time_mask(
+            s,
+            preprocess,
+            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        int num_wgs_per_tile = estimate_num_wgs_per_tile<ReductionStrategy>(kargs.tile_partitioner);
+
+        return std::tuple{ave_time, num_wgs_per_tile};
+    };
+
+    if constexpr(ck_tile::StreamKReductionStrategy::Atomic == ReductionStrategy)
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              // Since we are doing stream K, in the case of
+                                              // atomics, multiple workgroups may write to the same
+                                              // output tile in the C tensor, so we must atomic add
+                                              // the results (not set)
+                                              ck_tile::memory_operation_enum::atomic_add>{});
+    }
+    else // We are using ck_tile::StreamKReductionStrategy::Reduction
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              // In this case, there is only ever 1 WG writing final
+                                              // results to each macro tile in the C tensor, so we
+                                              // can do a set.
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+}
+
+template <typename GemmConfig, typename TypeConfig>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_gemm_example_with_layouts<GemmConfig, TypeConfig>(
+            argc, argv, Row{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported layouts.");
+    }
+
+    return 0;
+}
+
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "bf16")
+    {
+        using TypeConfig = StreamKGemmTypeConfig<ck_tile::bf16_t>;
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, TypeConfig>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp16")
+    {
+        using TypeConfig = StreamKGemmTypeConfig<ck_tile::half_t>;
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, TypeConfig>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+
+    return false;
+}
+
+int main(int argc, char* argv[])
+{
+    return !run_gemm_example<GemmConfigMemoryInterwave>(argc, argv);
+}
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 8fce70ba04..931f2d315f 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -21,6 +21,8 @@ add_subdirectory(18_flatmm)
 add_subdirectory(19_gemm_multi_d)
 add_subdirectory(20_grouped_convolution)
 add_subdirectory(21_elementwise)
+add_subdirectory(22_gemm_multi_abd)
 add_subdirectory(35_batched_transpose)
 add_subdirectory(38_block_scale_gemm)
 add_subdirectory(39_copy)
+add_subdirectory(40_streamk_gemm)
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 09801203ba..7aee7fca28 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/config.h"
+#include <stdint.h>
 
 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
@@ -50,10 +51,11 @@
 #endif
 
 // define general macros for various architectures
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) || \
+    defined(__gfx9_4_generic__)
 #define __gfx9__
 #endif
-#if defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__)
 #define __gfx94__
 #endif
 #if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__)
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index 2bc5a4414e..0c4f056a46 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -68,11 +68,34 @@ inline bool is_gfx11_supported()
 inline bool is_xdl_supported()
 {
     return ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-           ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"
-#if defined(CK_ENABLE_DYNAMIC_WARP_SIZE)
-           || is_gfx12_supported() || is_gfx11_supported()
-#endif
-        ;
+           ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+           is_gfx12_supported() || is_gfx11_supported();
+}
+
+template <typename ADataType, typename BDataType, index_t MPerXDL, index_t NPerXDL>
+inline bool is_xdl_wmma_supported()
+{
+    if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+       ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")
+    {
+        return true;
+    }
+    else if(is_gfx12_supported() || is_gfx11_supported())
+    {
+        if constexpr((MPerXDL != 16) || (NPerXDL != 16))
+        {
+            return false;
+        }
+        if constexpr(sizeof(ADataType) > 2 || sizeof(BDataType) > 2)
+        {
+            return false;
+        }
+        return true;
+    }
+    else
+    {
+        return false;
+    }
 }
 
 inline bool is_lds_direct_load_supported()
@@ -101,5 +124,12 @@ inline bool is_gfx103_supported()
            ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
 }
 
+inline bool is_wmma_supported()
+{
+    return is_gfx103_supported() || is_gfx11_supported() || is_gfx12_supported();
+}
+
+inline bool is_tf32_supported() { return (ck::get_device_name() == "gfx942") ? true : false; }
+
 } // namespace ck
 #endif
diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp
index 08b3aba2b3..5da447125e 100644
--- a/include/ck/host_utility/flush_cache.hpp
+++ b/include/ck/host_utility/flush_cache.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -15,6 +15,151 @@
 namespace ck {
 namespace utility {
 
+template <typename Argument, typename AsDataType, typename BsDataType, typename DsDataType>
+struct RotatingMemWrapperMultiABD
+{
+    static constexpr index_t NumAs = AsDataType::Size();
+    static constexpr index_t NumBs = BsDataType::Size();
+    static constexpr index_t NumDs = DsDataType::Size();
+
+    using AsGridPointer = decltype(Argument::p_as_grid);
+    using BsGridPointer = decltype(Argument::p_bs_grid);
+    using DsGridPointer = decltype(Argument::p_ds_grid);
+
+    RotatingMemWrapperMultiABD() = delete;
+    RotatingMemWrapperMultiABD(Argument& arg_,
+                               std::size_t rotating_count_,
+                               std::array<std::size_t, NumAs> size_as_,
+                               std::array<std::size_t, NumBs> size_bs_,
+                               std::array<std::size_t, NumDs> size_ds_)
+        : arg(arg_),
+          rotating_count(rotating_count_),
+          size_as(size_as_),
+          size_bs(size_bs_),
+          size_ds(size_ds_)
+    {
+        p_as_grids.push_back(arg.p_as_grid);
+        p_bs_grids.push_back(arg.p_bs_grid);
+        p_ds_grids.push_back(arg.p_ds_grid);
+        for(size_t i = 1; i < rotating_count; i++)
+        {
+            {
+                AsGridPointer as_buffer;
+                static_for<0, NumAs, 1>{}([&](auto j) {
+                    void* pADeviceBuf;
+                    hip_check_error(hipMalloc(static_cast<void**>(&pADeviceBuf), size_as_[j]));
+                    hip_check_error(hipMemcpy(static_cast<void*>(pADeviceBuf),
+                                              static_cast<const void*>(p_as_grids[0][j]),
+                                              size_as_[j],
+                                              hipMemcpyDeviceToDevice));
+                    using ADataType = remove_cvref_t<tuple_element_t<j.value, AsDataType>>;
+
+                    as_buffer(j) = static_cast<const ADataType*>(pADeviceBuf);
+                });
+                p_as_grids.push_back(as_buffer);
+            }
+
+            {
+                BsGridPointer bs_buffer;
+                static_for<0, NumBs, 1>{}([&](auto j) {
+                    void* pBDeviceBuf;
+                    hip_check_error(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_bs_[j]));
+                    hip_check_error(hipMemcpy(static_cast<void*>(pBDeviceBuf),
+                                              static_cast<const void*>(p_bs_grids[0][j]),
+                                              size_bs_[j],
+                                              hipMemcpyDeviceToDevice));
+                    using BDataType = remove_cvref_t<tuple_element_t<j.value, BsDataType>>;
+
+                    bs_buffer(j) = static_cast<const BDataType*>(pBDeviceBuf);
+                });
+                p_bs_grids.push_back(bs_buffer);
+            }
+
+            {
+                DsGridPointer ds_buffer;
+                static_for<0, NumDs, 1>{}([&](auto j) {
+                    void* pDDeviceBuf;
+                    hip_check_error(hipMalloc(static_cast<void**>(&pDDeviceBuf), size_ds_[j]));
+                    hip_check_error(hipMemcpy(static_cast<void*>(pDDeviceBuf),
+                                              static_cast<const void*>(p_ds_grids[0][j]),
+                                              size_ds_[j],
+                                              hipMemcpyDeviceToDevice));
+
+                    using DDataType = remove_cvref_t<tuple_element_t<j.value, DsDataType>>;
+
+                    ds_buffer(j) = static_cast<const DDataType*>(pDDeviceBuf);
+                });
+
+                p_ds_grids.push_back(ds_buffer);
+            }
+        }
+    }
+
+    void Next()
+    {
+        if(rotating_count > 1)
+        {
+            std::size_t idx = iter++ % rotating_count;
+            arg.p_as_grid   = p_as_grids[idx];
+            arg.p_bs_grid   = p_bs_grids[idx];
+            arg.p_ds_grid   = p_ds_grids[idx];
+        }
+    }
+    void Print()
+    {
+        std::cout << "RotatingMemWrapperMultiD: { size_a: {";
+        static_for<0, NumAs, 1>{}(
+            [&](auto j) { std::cout << size_as[j] << (j.value < NumAs - 1 ? ", " : ""); });
+        std::cout << "}, size_b: {";
+        static_for<0, NumBs, 1>{}(
+            [&](auto j) { std::cout << size_bs[j] << (j.value < NumBs - 1 ? ", " : ""); });
+        std::cout << "}, rotating_count: " << rotating_count << "}" << std::endl;
+    }
+    ~RotatingMemWrapperMultiABD()
+    {
+        if(rotating_count > 1)
+        {
+            // restore ptr
+            arg.p_as_grid = p_as_grids[0];
+            arg.p_bs_grid = p_bs_grids[0];
+            arg.p_ds_grid = p_ds_grids[0];
+
+            // free device mem
+            for(size_t i = 1; i < rotating_count; i++)
+            {
+                static_for<0, NumAs, 1>{}([&](auto j) {
+                    using ADataType = remove_cvref_t<tuple_element_t<j.value, AsDataType>>;
+                    hip_check_error(
+                        hipFree(static_cast<void*>(const_cast<ADataType*>(p_as_grids[i][j]))));
+                });
+
+                static_for<0, NumBs, 1>{}([&](auto j) {
+                    using BDataType = remove_cvref_t<tuple_element_t<j.value, BsDataType>>;
+                    hip_check_error(
+                        hipFree(static_cast<void*>(const_cast<BDataType*>(p_bs_grids[i][j]))));
+                });
+
+                static_for<0, NumDs, 1>{}([&](auto j) {
+                    using DDataType = remove_cvref_t<tuple_element_t<j.value, DsDataType>>;
+                    hip_check_error(
+                        hipFree(static_cast<void*>(const_cast<DDataType*>(p_ds_grids[i][j]))));
+                });
+            }
+        }
+    }
+
+    private:
+    Argument& arg;
+    std::size_t iter                       = 0;
+    std::size_t rotating_count             = 1;
+    std::array<std::size_t, NumAs> size_as = {0};
+    std::array<std::size_t, NumBs> size_bs = {0};
+    std::array<std::size_t, NumDs> size_ds = {0};
+    std::vector<AsGridPointer> p_as_grids;
+    std::vector<BsGridPointer> p_bs_grids;
+    std::vector<DsGridPointer> p_ds_grids;
+};
+
 template <typename Argument, typename DsDataType>
 struct RotatingMemWrapperMultiD
 {
@@ -318,6 +463,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             //             total_time += cur_time;
             // #endif
 
+#if !defined(CK_USE_WMMA)
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
                 // std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
@@ -326,6 +472,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
                        static_cast<const void*>(gemm_args.p_a_grid),
                        static_cast<const void*>(gemm_args.p_b_grid));
             }
+#endif
         }
         hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
         hip_check_error(hipEventSynchronize(stop));
diff --git a/include/ck/library/utility/check_err.hpp b/include/ck/library/utility/check_err.hpp
index d33ecaeef8..185166f7ec 100644
--- a/include/ck/library/utility/check_err.hpp
+++ b/include/ck/library/utility/check_err.hpp
@@ -180,13 +180,13 @@ check_err(const Range& out,
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
-            err_count++;
             if(err_count < 5)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
             }
             res = false;
+            err_count++;
         }
     }
     if(!res)
diff --git a/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
index d4ceefb458..e8d33f4216 100644
--- a/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
+++ b/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
@@ -203,8 +203,11 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck::utils::conv::ConvPa
     }
 
     return transpose_host_tensor_descriptor_given_new2old(
-        HostTensorDescriptor(physical_lengths),
-        detail::get_layout_transpose_gnchw_to_old<InLayout>());
+        // TBD: specify explicit conv layout rather than base one
+        HostTensorDescriptor(physical_lengths,
+                             ck::tensor_layout::convolution::BaseConvolutionLayout{}),
+        detail::get_layout_transpose_gnchw_to_old<InLayout>(),
+        InLayout{});
 }
 
 // make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
@@ -296,8 +299,10 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa
     }
 
     return transpose_host_tensor_descriptor_given_new2old(
-        HostTensorDescriptor(physical_lengths),
-        detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
+        HostTensorDescriptor(physical_lengths,
+                             ck::tensor_layout::convolution::BaseConvolutionLayout{}),
+        detail::get_layout_transpose_gnchw_to_old<WeiLayout>(),
+        WeiLayout{});
 }
 
 // make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
@@ -386,8 +391,10 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvP
     }
 
     return transpose_host_tensor_descriptor_given_new2old(
-        HostTensorDescriptor(physical_lengths),
-        detail::get_layout_transpose_gnchw_to_old<OutLayout>());
+        HostTensorDescriptor(physical_lengths,
+                             ck::tensor_layout::convolution::BaseConvolutionLayout{}),
+        detail::get_layout_transpose_gnchw_to_old<OutLayout>(),
+        OutLayout{});
 }
 
 } // namespace conv
diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
index fb8f6e79dc..55505524e0 100644
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -21,6 +21,8 @@
 #include "ck/library/utility/ranges.hpp"
 #include "ck/library/utility/thread.hpp"
 
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
 template <typename Range>
 std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
 {
@@ -97,59 +99,455 @@ auto construct_f_unpack_args(F, T args)
     return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
 }
 
+/**
+ * @brief A descriptor class for host tensors that manages tensor dimensions, strides, and layout.
+ *
+ * The HostTensorDescriptor provides a comprehensive interface for describing multi-dimensional
+ * tensors with configurable layouts and automatic stride calculation capabilities.
+ *
+ * @section stride_handling Stride Handling
+ *
+ * The descriptor supports multiple stride specification modes:
+ *
+ * 1. **Explicit Strides**: When strides are provided explicitly, they are validated against
+ *    the specified layout to ensure memory access patterns are correct.
+ *
+ * 2. **Auto-calculated Strides**: When strides are empty or all-zero, they are automatically
+ *    calculated based on the tensor layout:
+ *    - For RowMajor layout: rightmost dimension has stride 1, others calculated as cumulative
+ * products
+ *    - For ColumnMajor layout: similar to RowMajor but with swapped stride positions for last two
+ * dimensions
+ *
+ * 3. **Partial Stride Specification**: For GEMM layouts, unknown strides (represented as 0 or
+ * negative values) in the last two dimensions can be auto-calculated while preserving higher
+ * dimension strides.
+ *
+ * 4. **Bypass**: When using `BypassLayoutVerification` layout, no stride calculation or validation
+ * is performed. That allows to pass in any arbitrary strides including 0.
+ *
+ * For more details see `CalculateStrides` method.
+ *
+ * @section layout_support Layout Support
+ *
+ * - **GEMM Layouts**: Supports RowMajor and ColumnMajor layouts with full validation
+ * - **Convolution Layouts**: Recognized but validation is not yet implemented
+ * - **Abstract Layouts**: BaseTensorLayout will attempt automatic layout detection for 2D tensors
+ *
+ * @section limitations Limitations
+ *
+ * 1. **Layout Detection**: Automatic layout detection only works reliably for 2D tensors.
+ *    This is done mostly for legacy GEMM cases to avoid modifying many existing GEMM tests to pass
+ *    RowMajor/ColumnMajor explicitly. Higher-dimensional tensors with BaseTensorLayout will throw
+ *    validation errors. For more details see `HandleDefaultLayout` method.
+ *
+ * 2. **Stride Validation**: Only GEMM layouts (RowMajor/ColumnMajor) have full stride validation.
+ *    Convolution layouts are accepted but not validated. For more details see `ValidateStrides`.
+ *
+ * 3. **GEMM Assumptions**: For tensors with more than 2 dimensions, GEMM layout validation
+ *    assumes the last two dimensions represent the height-width pattern (e.g., BHW or BWH for
+ * batched GEMM).
+ *
+ * 4. **Negative Stride Handling**: Negative stride values are interpreted as "unknown" and
+ *    converted to auto-calculated values only for supported layouts.
+ *
+ * @section thread_safety Thread Safety
+ * This class is not thread-safe. External synchronization is required for concurrent access.
+ *
+ * @section examples Usage Examples
+ *
+ * ```cpp
+ * // Auto-calculate strides for RowMajor layout
+ * HostTensorDescriptor desc1({4, 3}, ck::tensor_layout::gemm::RowMajor{});
+ *
+ * // Explicit strides with validation
+ * HostTensorDescriptor desc2({4, 3}, {3, 1}, ck::tensor_layout::gemm::RowMajor{});
+ *
+ * // Partial stride specification (auto-calculate unknown dimension)
+ * HostTensorDescriptor desc3({4, 3}, {0, 1}, ck::tensor_layout::gemm::RowMajor{});
+ * ```
+ */
 struct HostTensorDescriptor
 {
-    HostTensorDescriptor() = default;
+    using BaseTensorLayout = ck::tensor_layout::BaseTensorLayout;
+    using DefaultLayout    = BaseTensorLayout;
 
-    void CalculateStrides();
-
-    template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
-    HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
+    // Runtime tag describing which layout is picked when layout is not specified explicitly at
+    // construction time.
+    enum class ChosenLayout
     {
-        this->CalculateStrides();
+        Original,
+        RowMajor,
+        ColumnMajor
+    };
+
+    // Master constructor
+    template <typename Layout>
+    HostTensorDescriptor(std::vector<std::size_t> lens,
+                         std::vector<std::size_t> strides,
+                         const Layout& layout = DefaultLayout())
+        : mLens(std::move(lens)), mStrides(std::move(strides))
+    {
+        // To support legacy use cases, when layout is not passed in
+        const auto new_layout = HandleDefaultLayout(layout);
+        if(dbg)
+        {
+            std::cout << "Original Lens: [";
+            LogRange(std::cout, mLens, ", ") << "] and Strides: [";
+            LogRange(std::cout, mStrides, ", ") << "]" << std::endl;
+            std::cout << "Layout: " << layout << " --> " << new_layout << std::endl;
+        }
+
+        // Handling the strides and validation based on the chosen layout
+        DispatchChosenLayout(new_layout, layout, [&](auto selected_layout) {
+            this->CalculateStrides(selected_layout);
+            this->ValidateStrides(selected_layout);
+        });
     }
 
-    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens)
-        : mLens(lens.begin(), lens.end())
+    HostTensorDescriptor() : HostTensorDescriptor({}, {}, DefaultLayout()){};
+
+    // Helper that invokes a callable with a concrete layout object whose type
+    // matches the chosen tag (so template code depending on the layout type
+    // can still leverage if constexpr branches).
+    template <typename F, typename OrigLayout>
+    void DispatchChosenLayout(ChosenLayout tag, const OrigLayout& orig, F&& f) const
     {
-        this->CalculateStrides();
+        switch(tag)
+        {
+        case ChosenLayout::RowMajor: f(ck::tensor_layout::gemm::RowMajor{}); break;
+        case ChosenLayout::ColumnMajor: f(ck::tensor_layout::gemm::ColumnMajor{}); break;
+        case ChosenLayout::Original:
+        default: f(orig); break;
+        }
+    }
+
+    template <typename Layout>
+    ChosenLayout HandleDefaultLayout(const Layout&)
+    {
+        if constexpr(!std::is_same_v<Layout, DefaultLayout>)
+        {
+            return ChosenLayout::Original;
+        }
+        else
+        {
+            if(mStrides.empty())
+            {
+                // No strides provided -> assume RowMajor
+                return ChosenLayout::RowMajor;
+            }
+
+            const auto rank = mLens.size();
+
+            if(rank > 2)
+            {
+                // Keep as-is - validation will warn/throw later
+                return ChosenLayout::Original;
+            }
+
+            if(rank == 0)
+            {
+                // Keep as-is - validation will warn/throw later
+                return ChosenLayout::Original;
+            }
+
+            if(rank == 1)
+            {
+                // Treat 1D tensor as RowMajor
+                return ChosenLayout::RowMajor;
+            }
+
+            // rank == 2
+            if(mStrides.size() == 2)
+            {
+                // RowMajor pattern (?, 1)
+                if(mStrides[1] == 1)
+                {
+                    return ChosenLayout::RowMajor;
+                }
+
+                // ColumnMajor pattern (1, ?)
+                if(mStrides[0] == 1)
+                {
+                    return ChosenLayout::ColumnMajor;
+                }
+            }
+
+            // Fallback: leave as-is
+            return ChosenLayout::Original;
+        }
+    }
+
+    template <typename Layout>
+    void CalculateStrides(const Layout& layout)
+    {
+        if constexpr(std::is_same_v<Layout, ck::tensor_layout::BypassLayoutVerification>)
+            return;
+        // This is a workaround if the original stride value is -1 (which means "unknown") has been
+        // passed in and casted to size_t (unsigned).
+        auto strides_int = AsInt(mStrides);
+
+        // case of empty strides or all-zero: auto-calculate based on layout and tensor dimensions
+        if(mStrides.empty() || std::all_of(strides_int.begin(), strides_int.end(), [](int stride) {
+               return stride <= 0;
+           }))
+        {
+
+            if constexpr(!(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
+                           std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>))
+            {
+                std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty "
+                             "strides, got "
+                          << layout << ". Will calculate strides as RowMajor." << std::endl;
+            }
+
+            mStrides.clear();
+            mStrides.resize(mLens.size(), 0);
+            if(mStrides.empty())
+                return;
+
+            mStrides.back() = 1;
+            std::partial_sum(mLens.rbegin(),
+                             mLens.rend() - 1,
+                             mStrides.rbegin() + 1,
+                             std::multiplies<std::size_t>());
+
+            if constexpr(std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
+            {
+                // swap the last two strides
+                if(mStrides.size() >= 2)
+                    std::swap(mStrides[mStrides.size() - 1], mStrides[mStrides.size() - 2]);
+            }
+        }
+        // The other case is if one of the strides is unknown
+        // Currently, only GEMM RowMajor and ColumnMajor layouts are supported and only in the lower
+        // two dimensions, e.g. {..., 0, N} or {..., M, 0}. The higher dimensions are left
+        // untouched.
+        else if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
+                          std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
+        {
+            auto rank = mStrides.size();
+            if(mLens.size() >= 2 && rank >= 2)
+            {
+                const auto inner_idx =
+                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
+                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
+                if(mStrides[inner_idx] <= 0)
+                {
+                    mStrides[inner_idx] = 1;
+                }
+                if(mStrides[outer_idx] <= 0)
+                {
+                    mStrides[outer_idx] = mLens[inner_idx] * mStrides[inner_idx];
+                }
+            }
+        }
+    }
+
+    template <typename Layout>
+    void ValidateStrides(const Layout& layout) const
+    {
+        if constexpr(std::is_same_v<ck::tensor_layout::BypassLayoutVerification, Layout>)
+        {
+            return;
+        }
+
+        if(mLens.empty())
+        {
+            throw std::runtime_error(
+                "HostTensorDescriptor::ValidateStrides: empty tensor dimensions is not allowed.");
+        }
+
+        const int rank = mLens.size();
+        if(rank == 1) // skip any 1D tensors
+        {
+            return;
+        }
+
+        if constexpr(std::is_same_v<ck::tensor_layout::BaseTensorLayout, Layout>)
+        {
+            // Any legacy code that doesn't pass layout to HostTensorDescriptor ctor will
+            // hit this case (unless it is a special case - see `HandleDefaultLayout`).
+            throw std::runtime_error("HostTensorDescriptor::ValidateStrides: Abstract tensor "
+                                     "layout BaseTensorLayout can't be verified. Pls "
+                                     "pass specific tensor layout to HostTensorDescriptor (or "
+                                     "ck::tensor_layout::BypassLayoutVerification)");
+        }
+
+        // GEMM cases
+        if constexpr(std::is_base_of_v<ck::tensor_layout::gemm::BaseGemmLayout, Layout>)
+        {
+            if(mLens.size() != mStrides.size())
+            {
+                std::ostringstream oss;
+                oss << "HostTensorDescriptor::ValidateStrides: mismatch between tensor rank and "
+                       "size of strides: "
+                    << *this;
+                throw std::runtime_error(oss.str());
+            }
+
+            // in GEMM, strides must be all positive or all zeros (auto-derived from tensor
+            // dimensions)
+            auto strides_int = AsInt(mStrides);
+            if(std::any_of(
+                   strides_int.begin(), strides_int.end(), [](int stride) { return stride <= 0; }))
+            {
+                std::ostringstream oss;
+                oss << "Stride values must be positive or all-zeros (auto-derived from tensor "
+                       "dimensions). Instead got ";
+                std::copy(
+                    strides_int.begin(), strides_int.end(), std::ostream_iterator<int>(oss, " "));
+                throw std::runtime_error(oss.str());
+            }
+
+            if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
+                         std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
+            {
+                // The logic here assumes the GEMM with tensor of more than 2 dims, will always have
+                // HW dimesnsions as the inner ones e.g. batched GEMM is either BHW or BWH
+                const auto inner_idx =
+                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
+                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
+
+                if(mStrides[outer_idx] < mLens[inner_idx] * mStrides[inner_idx])
+                {
+                    std::ostringstream oss;
+                    oss << "Invalid strides for " << layout << ": " << *this;
+                    throw std::runtime_error(oss.str());
+                }
+
+                // For higher dimensions, validate strides assuming RowMajor
+                for(int i = 1; i < rank - 2; ++i)
+                {
+                    if(mStrides[i - 1] < mStrides[i] * mLens[i])
+                    {
+                        std::ostringstream oss;
+                        oss << "Invalid strides for higher dimensions in " << layout << ": "
+                            << *this;
+                        throw std::runtime_error(oss.str());
+                    }
+                }
+            }
+            else
+            {
+                std::ostringstream oss;
+                oss << "Error: Unsupported GEMM layout: " << layout;
+                throw std::runtime_error(oss.str());
+            }
+        }
+        // Convolution cases
+        else if constexpr(std::is_base_of_v<ck::tensor_layout::convolution::BaseConvolutionLayout,
+                                            Layout>)
+        {
+            // TBD: implement verification for Conv layouts
+            // For now, just print warning and return
+            std::cerr << "Warning: Tensor layout verification for ck::tensor_layout::convolution "
+                         "layouts is not supported yet. Skipping..."
+                      << std::endl;
+            return;
+        }
+        else
+        {
+            std::ostringstream oss;
+            oss << "Error: Tensor layout verification for " << layout << " is not supported yet.";
+            throw std::runtime_error(oss.str());
+        }
+    }
+
+    template <typename X,
+              typename Layout = DefaultLayout,
+              typename        = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
+                                                 std::is_convertible_v<Layout, BaseTensorLayout>>>
+    HostTensorDescriptor(const std::initializer_list<X>& lens, const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
+    {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
+    }
+
+    template <typename Layout = DefaultLayout,
+              typename        = std::enable_if_t<std::is_convertible_v<Layout, BaseTensorLayout>>>
+    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
+                         const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
+    {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
     }
 
     template <typename Lengths,
-              typename = std::enable_if_t<
-                  std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
-                  std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>>>
-    HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
+              typename Layout = DefaultLayout,
+              typename        = std::enable_if_t<
+                         (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
+                   std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>) &&
+                         std::is_convertible_v<Layout, BaseTensorLayout>>>
+    HostTensorDescriptor(const Lengths& lens, const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
     {
-        this->CalculateStrides();
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
     }
 
     template <typename X,
               typename Y,
-              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
-                                          std::is_convertible_v<Y, std::size_t>>>
+              typename        = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
+                                                 std::is_convertible_v<Y, std::size_t>>,
+              typename Layout = DefaultLayout>
     HostTensorDescriptor(const std::initializer_list<X>& lens,
-                         const std::initializer_list<Y>& strides)
-        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
+                         const std::initializer_list<Y>& strides,
+                         const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
+                               std::vector<std::size_t>(strides.begin(), strides.end()),
+                               layout)
     {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
     }
 
+    // HostTensorDescriptor({row, col}, {row_stride, col_stride})
+    template <typename Layout = DefaultLayout>
     HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
-                         const std::initializer_list<ck::long_index_t>& strides)
-        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
+                         const std::initializer_list<ck::long_index_t>& strides,
+                         const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
+                               std::vector<std::size_t>(strides.begin(), strides.end()),
+                               layout)
     {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
+    }
+
+    // HostTensorDescriptor({row, col}, strides)
+    template <typename Strides, typename Layout = DefaultLayout>
+    HostTensorDescriptor(const std::initializer_list<std::size_t>& lens,
+                         const Strides& strides,
+                         const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
+                               std::vector<std::size_t>(strides.begin(), strides.end()),
+                               layout)
+    {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
     }
 
     template <typename Lengths,
               typename Strides,
-              typename = std::enable_if_t<
-                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
-                   std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
-                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
-                   std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)>>
-    HostTensorDescriptor(const Lengths& lens, const Strides& strides)
-        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
+              typename Layout = DefaultLayout,
+              typename        = std::enable_if_t<
+                         ((std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
+                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
+                   (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
+                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)) &&
+                         std::is_convertible_v<Layout, BaseTensorLayout>>>
+    HostTensorDescriptor(const Lengths& lens,
+                         const Strides& strides,
+                         const Layout& layout = Layout{})
+        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
+                               std::vector<std::size_t>(strides.begin(), strides.end()),
+                               layout)
     {
+        if(dbg)
+            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
     }
 
     std::size_t GetNumOfDimension() const;
@@ -173,15 +571,34 @@ struct HostTensorDescriptor
     }
 
     friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
+    friend std::ostream& operator<<(std::ostream& os, ChosenLayout tag);
 
     private:
     std::vector<std::size_t> mLens;
     std::vector<std::size_t> mStrides;
+    static constexpr bool dbg = false;
+
+    /**
+     * @brief Converts a vector of size_t values to a vector of int values.
+     *
+     * @param vec The input vector of size_t values to be converted.
+     * @return std::vector<int> A vector containing the converted int values.
+     */
+    std::vector<int> AsInt(const std::vector<size_t>& vec) const
+    {
+        std::vector<int> strides_int(vec.size());
+        std::transform(vec.begin(), vec.end(), strides_int.begin(), [](std::size_t stride) {
+            return static_cast<int>(stride);
+        });
+        return strides_int;
+    }
 };
 
-template <typename New2Old>
-HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
-                                                                    const New2Old& new2old)
+template <typename New2Old, typename NewLayout = HostTensorDescriptor::BaseTensorLayout>
+HostTensorDescriptor
+transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
+                                               const New2Old& new2old,
+                                               const NewLayout& new_layout = NewLayout())
 {
     std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
     std::vector<std::size_t> new_strides(a.GetNumOfDimension());
@@ -192,7 +609,7 @@ HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTe
         new_strides[i] = a.GetStrides()[new2old[i]];
     }
 
-    return HostTensorDescriptor(new_lengths, new_strides);
+    return HostTensorDescriptor(new_lengths, new_strides, new_layout);
 }
 
 struct joinable_thread : std::thread
@@ -300,6 +717,36 @@ struct Tensor
     {
     }
 
+    template <typename X, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
+    Tensor(std::initializer_list<X> lens, Rest&&... rest)
+        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
+    {
+    }
+
+    template <typename X,
+              typename Y,
+              typename... Rest,
+              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
+    Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides, Rest&&... rest)
+        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
+    {
+    }
+
+    template <typename Lengths, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
+    Tensor(const Lengths& lens, Rest&&... rest)
+        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
+    {
+    }
+
+    template <typename Lengths,
+              typename Strides,
+              typename... Rest,
+              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
+    Tensor(const Lengths& lens, const Strides& strides, Rest&&... rest)
+        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
+    {
+    }
+
     Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}
 
     template <typename OutT>
diff --git a/include/ck/library/utility/validation_common.hpp b/include/ck/library/utility/validation_common.hpp
deleted file mode 100644
index 38933c6d7c..0000000000
--- a/include/ck/library/utility/validation_common.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include "ck/ck.hpp"
-#include "ck/utility/type.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-
-namespace ck {
-namespace utils {
-
-template <typename Layout>
-inline void
-validate_gemm_stride(int M, int N, int stride, const std::string& stride_name = "Stride")
-{
-    if(ck::is_same_v<Layout, ck::tensor_layout::gemm::ColumnMajor>)
-    {
-        if(stride < M)
-        {
-            throw std::runtime_error(
-                "Error: For ColumnMajor layout, " + stride_name + " (" + std::to_string(stride) +
-                ") must be greater than or equal to dim (" + std::to_string(M) + ")");
-        }
-    }
-    else // RowMajor
-    {
-        if(stride < N)
-        {
-            throw std::runtime_error(
-                "Error: For RowMajor layout, " + stride_name + " (" + std::to_string(stride) +
-                ") must be greater than or equal to dim (" + std::to_string(N) + ")");
-        }
-    }
-}
-
-// Convenience functions for common GEMM patterns
-template <typename ALayout, typename BLayout, typename CLayout>
-inline void validate_gemm_strides_abc(int M, int N, int K, int StrideA, int StrideB, int StrideC)
-{
-    validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
-    validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
-    validate_gemm_stride<CLayout>(M, N, StrideC, "StrideC");
-}
-
-} // namespace utils
-} // namespace ck
diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp
index c152cbfb1e..e24227ecc3 100644
--- a/include/ck/tensor_description/multi_index_transform.hpp
+++ b/include/ck/tensor_description/multi_index_transform.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -1553,6 +1553,198 @@ struct UnMerge
     }
 };
 
+/**
+ * @brief Transformation struct for convolution backward data output indices to GEMM indices.
+ *
+ * This struct is responsible for mapping the output tensor indices (N, Ho, Wo, K) from the
+ * convolution backward data operation to the corresponding indices (K0, M, K1) used in the
+ * implicit GEMM computation. It encapsulates the necessary parameters and transformation logic
+ * required to efficiently perform the index conversion.
+ */
+struct ConvBwdDataImplicitGemmOutTransform
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    using LowerIndex = MultiIndex<4>; // N, Ho, Wo, K
+    using UpperIndex = MultiIndex<3>; // K0, M, K1
+
+    index_t N_, Ho_, Wo_, K_;
+    index_t XDot_;
+    index_t HTilde_, WTilde_;
+    index_t WTildeSlice_, TildeSlice_;
+    index_t IHTildeSliceBegin_, IWTildeSliceBegin_;
+    index_t HRatio_, WRatio_;
+    index_t XDotSlice_K_;
+    index_t MPad_, KPad_;
+    Tuple<index_t, index_t, index_t> up_lengths_; // K0_, MPadded, K1_;
+
+    Tuple<index_t, index_t, index_t, index_t>
+        low_lengths_magic_divisor_multiplier_; // XDotSlice_K_, K_, TildeSlice_, WTildeSlice_
+    Tuple<index_t, index_t, index_t, index_t>
+        low_lengths_magic_divisor_shift_; // XDotSlice_K_, K_, TildeSlice_, WTildeSlice_
+
+    __host__ __device__ constexpr ConvBwdDataImplicitGemmOutTransform() = default;
+
+    __host__ __device__ constexpr ConvBwdDataImplicitGemmOutTransform(index_t N,
+                                                                      index_t Ho,
+                                                                      index_t Wo,
+                                                                      index_t K,
+                                                                      index_t XDot,
+                                                                      index_t HTilde,
+                                                                      index_t WTilde,
+                                                                      index_t WTildeSlice,
+                                                                      index_t HWTildeSlice,
+                                                                      index_t IHTildeSliceBegin,
+                                                                      index_t IWTildeSliceBegin,
+                                                                      index_t HRatio,
+                                                                      index_t WRatio,
+                                                                      index_t XDotSlice_K,
+                                                                      index_t K0,
+                                                                      index_t MPadded,
+                                                                      index_t K1,
+                                                                      index_t MPad,
+                                                                      index_t KPad)
+        : N_{N},
+          Ho_{Ho},
+          Wo_{Wo},
+          K_{K},
+          XDot_{XDot},
+          HTilde_{HTilde},
+          WTilde_{WTilde},
+          WTildeSlice_{WTildeSlice},
+          TildeSlice_{HWTildeSlice},
+          IHTildeSliceBegin_{IHTildeSliceBegin},
+          IWTildeSliceBegin_{IWTildeSliceBegin},
+          HRatio_{HRatio},
+          WRatio_{WRatio},
+          XDotSlice_K_{XDotSlice_K},
+          MPad_{MPad},
+          KPad_{KPad},
+          up_lengths_{make_tuple(K0, MPadded, K1)},
+          low_lengths_magic_divisor_multiplier_{
+              MagicDivision::CalculateMagicMultiplier(XDotSlice_K_),
+              MagicDivision::CalculateMagicMultiplier(K_),
+              MagicDivision::CalculateMagicMultiplier(TildeSlice_),
+              MagicDivision::CalculateMagicMultiplier(WTildeSlice_)},
+          low_lengths_magic_divisor_shift_{MagicDivision::CalculateMagicShift(XDotSlice_K_),
+                                           MagicDivision::CalculateMagicShift(K_),
+                                           MagicDivision::CalculateMagicShift(TildeSlice_),
+                                           MagicDivision::CalculateMagicShift(WTildeSlice_)}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 4; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 3; }
+
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+
+    template <typename UpIdx>
+    __host__ __device__ constexpr auto CalculateLowerIndexN(const UpIdx& idx_up) const
+    {
+        index_t NStep, HStep, WStep;
+        // Merge
+        // NStep = M_id / TildeSlice_
+        NStep = MagicDivision::DoMagicDivision(idx_up[I1],
+                                               this->low_lengths_magic_divisor_multiplier_[I2],
+                                               this->low_lengths_magic_divisor_shift_[I2]);
+        HStep = idx_up[I1] - NStep * TildeSlice_;
+        // HStep = HStep / WTildeSlice_
+        HStep = MagicDivision::DoMagicDivision(HStep,
+                                               this->low_lengths_magic_divisor_multiplier_[I3],
+                                               this->low_lengths_magic_divisor_shift_[I3]);
+        WStep = idx_up[I1] - NStep * TildeSlice_ - HStep * WTildeSlice_;
+        // Slice
+        HStep += IHTildeSliceBegin_;
+        WStep += IWTildeSliceBegin_;
+
+        return make_tuple(NStep, HStep, WStep, 0);
+    }
+
+    template <typename UpIdx>
+    __host__ __device__ constexpr auto CalculateLowerIndexK(const UpIdx& idx_up) const
+    {
+        // UnMerge
+        //  K_idx <- K0_idx * K1 + K1_idx
+        index_t K_idx = idx_up[I0] * up_lengths_[I2] + idx_up[I2];
+        // Merge
+        // YStep = K_idx / XDotSlice_K_
+        index_t YStep =
+            MagicDivision::DoMagicDivision(K_idx,
+                                           this->low_lengths_magic_divisor_multiplier_[I0],
+                                           this->low_lengths_magic_divisor_shift_[I0]);
+        index_t KStep = K_idx - YStep * XDotSlice_K_;
+        // Xstep = KStep / K_
+        index_t XStep =
+            MagicDivision::DoMagicDivision(KStep,
+                                           this->low_lengths_magic_divisor_multiplier_[I1],
+                                           this->low_lengths_magic_divisor_shift_[I1]);
+        KStep -= XStep * K_;
+        // Embed
+        YStep *= HRatio_;
+        XStep *= WRatio_;
+
+        return make_tuple(0, YStep, XStep, KStep);
+    }
+
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        idx_low = CalculateLowerIndexN(idx_up) + CalculateLowerIndexK(idx_up);
+    }
+
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& /* idx_diff_up */,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up,
+                                              Number<Hack>) const
+    {
+        LowIdx low_old = idx_low;
+        idx_low        = CalculateLowerIndexN(idx_up) + CalculateLowerIndexK(idx_up);
+        idx_diff_low   = idx_low - low_old;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
+    {
+        // Padding
+        index_t K_idx  = idx_up[Number<0>{}] * up_lengths_[Number<2>{}] + idx_up[Number<2>{}];
+        index_t& M_idx = idx_up[Number<1>{}];
+
+        bool pad_valid = M_idx < up_lengths_[Number<1>{}] - MPad_ &&
+                         K_idx < up_lengths_[Number<0>{}] * up_lengths_[Number<2>{}] - KPad_;
+        return pad_valid;
+    }
+
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime() { return false; }
+
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("ConvBwdDataImplicitGemmOutTransform, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+
 template <typename LowerIndex>
 struct Freeze
 {
diff --git a/include/ck/tensor_description/multi_index_transform_helper.hpp b/include/ck/tensor_description/multi_index_transform_helper.hpp
index 8feadf63c6..a6626ae252 100644
--- a/include/ck/tensor_description/multi_index_transform_helper.hpp
+++ b/include/ck/tensor_description/multi_index_transform_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -94,6 +94,59 @@ __host__ __device__ constexpr auto make_unmerge_transform(
     return UnMerge<UpLengths, Use24BitIntegerCalculation>{up_lengths};
 }
 
+__host__ __device__ constexpr auto make_conv_bwd_data_out_transform(index_t N,
+                                                                    index_t Ho,
+                                                                    index_t Wo,
+                                                                    index_t K,
+                                                                    [[maybe_unused]] index_t YDot,
+                                                                    index_t XDot,
+                                                                    index_t HTilde,
+                                                                    index_t WTilde,
+                                                                    index_t ConvDilationH,
+                                                                    index_t ConvDilationW,
+                                                                    index_t HTildeSlice,
+                                                                    index_t WTildeSlice,
+                                                                    index_t YDotSlice,
+                                                                    index_t XDotSlice,
+                                                                    index_t IHTildeSliceBegin,
+                                                                    index_t IWTildeSliceBegin,
+                                                                    index_t GcdStrideDilationH,
+                                                                    index_t GcdStrideDilationW,
+                                                                    index_t K0,
+                                                                    index_t K1,
+                                                                    index_t MPerBlock,
+                                                                    index_t GemmKPerBlock)
+{
+    // Calculate padding
+    const auto MRaw    = N * HTildeSlice * WTildeSlice;
+    const auto MPadded = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+    const auto MPad    = MPadded - MRaw;
+
+    const auto KRaw    = YDotSlice * XDotSlice * K;
+    const auto KPadded = math::integer_divide_ceil(KRaw, GemmKPerBlock) * GemmKPerBlock;
+    const auto KPad    = KPadded - KRaw;
+
+    return ConvBwdDataImplicitGemmOutTransform{N,
+                                               Ho,
+                                               Wo,
+                                               K,
+                                               XDot,
+                                               HTilde,
+                                               WTilde,
+                                               WTildeSlice,
+                                               HTildeSlice * WTildeSlice,
+                                               IHTildeSliceBegin,
+                                               IWTildeSliceBegin,
+                                               -ConvDilationH / GcdStrideDilationH,
+                                               -ConvDilationW / GcdStrideDilationW,
+                                               XDotSlice * K,
+                                               K0,
+                                               MPadded,
+                                               K1,
+                                               MPad,
+                                               KPad};
+}
+
 template <typename LowerIndex>
 __host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_idx)
 {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
index f03427a7ea..5012e53d33 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -38,13 +38,15 @@ struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    static constexpr index_t WaveSize = get_warp_size();
-
     static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
     static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
     static constexpr index_t KPerBlock =
         BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
 
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerDpp);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerDpp);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
+
     static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
     static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
@@ -54,9 +56,6 @@ struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
 
     static constexpr index_t KPerThread = KPerBlock / dpp_gemm.K0PerDpp;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerDpp);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerDpp);
-
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               AccDataType,
                               MRepeat * NRepeat,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
index d0a594e2c6..6b38c340bc 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
@@ -46,7 +46,9 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
     // Hardcode to 64, as HIP-provided "WarpSize" would return 32 on RDNA GPUs.
-    static constexpr index_t WaveSize = 64;
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
@@ -77,9 +79,6 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
     static constexpr index_t KRepeat       = KPerThread / KPack;
     static constexpr index_t KPerInnerLoop = KPack;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     // Hardcode to 2, for better 8-bit access pattern
 
     static constexpr index_t MXdlPack = 2;
@@ -206,6 +205,7 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
                                          Tuple5 b_origin = CalculateBThreadOriginDataIndex())
         : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
     {
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(),
                       "wrong! Desc should be known at compile-time");
         static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
@@ -213,6 +213,7 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
 
         static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
                       "wrong!");
+#endif
     }
 
     // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
index bfb081330c..8cff087ddb 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
@@ -27,7 +27,8 @@ template <BlockGemmPipelineVersion BlkGemmPipelineVer,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC = false>
 constexpr auto BlockGemmPipeline_Selector()
 {
     if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
@@ -50,7 +51,8 @@ constexpr auto BlockGemmPipeline_Selector()
                                                 NPerWmma,
                                                 MRepeat,
                                                 NRepeat,
-                                                KPack>{};
+                                                KPack,
+                                                TransposeC>{};
     }
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
@@ -72,7 +74,8 @@ constexpr auto BlockGemmPipeline_Selector()
                                                 NPerWmma,
                                                 MRepeat,
                                                 NRepeat,
-                                                KPack>{};
+                                                KPack,
+                                                TransposeC>{};
     }
     else
     {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
index 6fb62bc677..28c871ae0d 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
@@ -277,6 +277,21 @@ struct BlockwiseGemmWmmaops_pipeline_base
                       "wrong!");
     }
 
+    // transposed WMMA output C' = B' * A'
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+
+        return make_naive_tensor_descriptor_packed(
+            //        |MRepeat            |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{}, I1, I1, Number<NRepeat>{}, I1, I1, NAccVgprs));
+    }
+
     __host__ __device__ static constexpr auto
     GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
     {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
index f25648efa6..76d748eb27 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
@@ -31,7 +31,8 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC = false>
 struct BlockwiseGemmWmmaops_pipeline_v1
 {
 };
@@ -53,7 +54,8 @@ template <index_t BlockSize,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                         BlockSize,
                                         ADataType,
@@ -72,7 +74,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                         NPerWmma,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
                                          BDataType,
@@ -90,8 +93,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                          NPerWmma,
                                          MRepeat,
                                          NRepeat,
-                                         KPack>
-
+                                         KPack,
+                                         TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                                     ADataType,
@@ -110,7 +113,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                                     NPerWmma,
                                                     MRepeat,
                                                     NRepeat,
-                                                    KPack>;
+                                                    KPack,
+                                                    TransposeC>;
     using Base::I0;
 
     using Base::A_K1;
@@ -329,7 +333,8 @@ template <index_t BlockSize,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                         BlockSize,
                                         ADataType,
@@ -348,7 +353,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                         NPerWmma,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
                                          BDataType,
@@ -366,8 +372,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                          NPerWmma,
                                          MRepeat,
                                          NRepeat,
-                                         KPack>
-
+                                         KPack,
+                                         TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                                     ADataType,
@@ -386,7 +392,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                                     NPerWmma,
                                                     MRepeat,
                                                     NRepeat,
-                                                    KPack>;
+                                                    KPack,
+                                                    TransposeC>;
     using Base::I0;
     using Base::I1;
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
index 8fed23d151..83dadb2175 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
@@ -31,7 +31,8 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC = false>
 struct BlockwiseGemmWmmaops_pipeline_v3
 {
 };
@@ -53,7 +54,8 @@ template <index_t BlockSize,
           index_t NPerWmma,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                         BlockSize,
                                         ADataType,
@@ -72,7 +74,8 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                         NPerWmma,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
                                          BDataType,
@@ -90,7 +93,8 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                          NPerWmma,
                                          MRepeat,
                                          NRepeat,
-                                         KPack>
+                                         KPack,
+                                         TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                                     ADataType,
@@ -109,7 +113,8 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                                     NPerWmma,
                                                     MRepeat,
                                                     NRepeat,
-                                                    KPack>;
+                                                    KPack,
+                                                    TransposeC>;
     using Base::I0;
 
     using Base::A_K1;
@@ -128,6 +133,8 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
     using Base::GetCThreadBuffer;
     using Base::
         GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
+    using Base::
+        GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs;
 
     using Base::a_block_desc_k0_m0_m1_m2_k1;
     using Base::b_block_desc_k0_n0_n1_n2_k1;
@@ -145,8 +152,21 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
 
     __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
     {
-        ignore = num_loop;
-        return TailNumber::Full;
+        if(BlockHasHotloop(num_loop))
+        {
+            return TailNumber::Full;
+        }
+        else
+        {
+            if(num_loop == 1)
+            {
+                return TailNumber::Odd;
+            }
+            else
+            {
+                return TailNumber::Even;
+            }
+        }
     }
 
     __device__ static constexpr auto HotLoopScheduler()
@@ -362,12 +382,15 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
         a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
         b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
 
-        // Global prefetch 2
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+        // Global prefetch 2, perform when at least 2 loops exist.
+        if constexpr(TailNum == TailNumber::Even || TailNum == TailNumber::Full)
+        {
+            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+            b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
 
-        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        }
 
         // Initialize C
         c_thread_buf.Clear();
@@ -379,7 +402,7 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
 
         __builtin_amdgcn_sched_barrier(0);
 
-        // main body
+        // Main body, perform when at least 3 loops exist.
         if constexpr(HasMainLoop)
         {
             index_t i = 0;
@@ -448,10 +471,62 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                 __builtin_amdgcn_sched_barrier(0);
 
                 i += 1;
-            } while(i < (num_loop - 1));
+            } while(i < (num_loop - 2));
         }
-        // tail
-        if constexpr(TailNum == TailNumber::Full)
+
+        // Pre-tail, perform when at least 2 loops exist.
+        if constexpr(TailNum == TailNumber::Even || TailNum == TailNumber::Full)
+        {
+            block_sync_lds();
+
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+            // No RunRead or MoveSrcSliceWindow here, already finished them all!
+
+            b_scale_struct.template GlobalLoad<0>(num_loop % num_loop_per_scale == 0);
+
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+
+                        static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                    Number<ik / A_K1>{}, m0, k0, I0, I0, Number<ik % A_K1>{}))>{}];
+                        });
+                        static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(make_tuple(
+                                    Number<ik / B_K1>{}, n0, k0, I0, I0, Number<ik % B_K1>{}))>{}];
+                        });
+
+                        using wmma_input_type_a =
+                            typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                        using wmma_input_type_b =
+                            typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                      b_thread_vec.template AsType<wmma_input_type_b>(),
+                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            LocalLoad(a_block_buf, a_thread_buf, b_block_buf, b_thread_buf, b_scale_struct);
+
+            HotLoopScheduler();
+            __builtin_amdgcn_sched_barrier(0);
+        }
+
+        // Tail, always perform.
         {
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
index 231dbf817c..b729271680 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -32,9 +32,9 @@ template <index_t BlockSize,
           index_t KPerXDL>
 struct BlockwiseGemmXdlops_pipeline_hotloop_inst
 {
-    static constexpr index_t WaveSize = 64;
     static constexpr index_t WaveNumM = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t WaveNumN = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / (WaveNumM * WaveNumN);
 
     static constexpr index_t A_Buffer_Load_Inst_Num =
         MPerBlock * KPerBlock / (BlockSize * ABufferLoadWidth);
@@ -108,7 +108,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    static constexpr index_t WaveSize = get_warp_size();
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
@@ -121,9 +123,6 @@ struct BlockwiseGemmXdlops_pipeline_v4
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
     static constexpr index_t KRepeat    = KPerThread / KPack;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     using HotLoopInstList = BlockwiseGemmXdlops_pipeline_hotloop_inst<BlockSize,
                                                                       MPerBlock,
                                                                       NPerBlock,
@@ -237,6 +236,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
                                     Tuple4 b_origin = CalculateBThreadOriginDataIndex())
         : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
     {
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(),
                       "wrong! Desc should be known at compile-time");
 
@@ -245,7 +245,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
 
         static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
                       "wrong!");
-
+#endif
         // HotLoopInstList::Print();
     }
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
index 7a565fbaa7..8d22caa6cb 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
@@ -141,6 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1<BlockGemmPipelineSch
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -153,7 +154,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1<BlockGemmPipelineSch
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
index 8b227a8aa1..b0a583030e 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
@@ -143,6 +143,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch
     using Base::BMmaKStride;
 
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr auto xdlops_gemm =
         XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, ComputeDataType>{};
@@ -159,7 +160,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
index 29750b8baa..fc7360dee5 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
@@ -142,6 +142,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
     using Base::AMmaKStride;
     using Base::BMmaKStride;
     using Base::c_thread_desc_;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -154,7 +155,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
index fe89e700c4..68cde2b880 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
@@ -144,6 +144,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1<BlockGemmPipelineSch
     using Base::BMmaKStride;
     using Base::c_thread_desc_;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -156,7 +157,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1<BlockGemmPipelineSch
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
index c76be74e52..fa67750e87 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -145,6 +145,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
     using Base::BMmaKStride;
 
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages        = 2;
     static constexpr index_t PrefillStages         = 1;
@@ -158,7 +159,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
index d8f11572a8..437c73fa97 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -143,6 +143,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
     using Base::AMmaKStride;
     using Base::BMmaKStride;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -155,7 +156,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
index 601756be44..03f50fe17a 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -142,6 +142,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 3;
     static constexpr index_t PrefillStages   = 2;
@@ -154,7 +155,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index 6f0404a1ca..3d725fc5fa 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -151,6 +151,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
     using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages        = 2;
     static constexpr index_t PrefillStages         = 1;
@@ -164,7 +165,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
index acd1d2ae49..d664a822aa 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -54,6 +54,9 @@ struct BlockwiseGemmXdlops_pipeline_base
     static constexpr auto xdlops_gemm =
         XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, ComputeDataType, TransposeC>{};
 
+    using ComputeDataTypeBuf =
+        conditional_t<std::is_same<ComputeDataType, ck::tf32_t>::value, float, ComputeDataType>;
+
     static constexpr index_t AMmaKStride = KPack;
     static constexpr index_t BMmaKStride = KPack;
 
@@ -93,8 +96,10 @@ struct BlockwiseGemmXdlops_pipeline_base
                                                       NPerXDL,
                                                       xdlops_gemm.KPerXdlops>;
 
+#if defined(__HIP_DEVICE_COMPILE__)
     static_assert(KPerThread % KPack == 0,
                   "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
+#endif
 
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               AccDataType,
@@ -374,7 +379,7 @@ struct BlockwiseGemmXdlops_pipeline_base
         make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
@@ -384,7 +389,7 @@ struct BlockwiseGemmXdlops_pipeline_base
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp
index 8e2922e2ce..2bd115cf35 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp
@@ -153,6 +153,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1<BlockGemmPipelineS
 
     using Base::MWaves;
     using Base::NWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -165,7 +166,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1<BlockGemmPipelineS
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
index cc4c5a2c36..ba5b6e8292 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
@@ -152,6 +152,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
     using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages        = 2;
     static constexpr index_t LocalPrefetchStages   = 2;
@@ -166,7 +167,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
index 1608506b40..3bec4821fb 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
@@ -153,6 +153,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
 
     using Base::MWaves;
     using Base::NWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -165,7 +166,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
index 30d6d4f812..7e33016cec 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
@@ -152,6 +152,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3<
     using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages        = 2;
     static constexpr index_t PrefillStages         = 1;
@@ -165,7 +166,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3<
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp
index 598b69cd61..5a21e41c57 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp
@@ -153,6 +153,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1<
 
     using Base::MWaves;
     using Base::NWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -165,7 +166,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1<
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v3.hpp
index 6db02d1dd7..99bca30dd9 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v3.hpp
@@ -152,6 +152,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3<
     using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MWaves;
+    using Base::WaveSize;
 
     static constexpr index_t PrefetchStages        = 2;
     static constexpr index_t PrefillStages         = 1;
@@ -165,7 +166,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3<
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
         constexpr index_t K2 = KPack / KGroup;
-        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K1 = WaveSize / NPerXDL;
         constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
index f597573dc2..f281184c14 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
@@ -140,6 +140,8 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 1;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
@@ -185,9 +187,9 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -240,20 +242,20 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -301,20 +303,20 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -439,6 +441,8 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
     using Base::a_block_desc_m0_m1_m2_k;
     using Base::b_block_desc_n0_n1_n2_k;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t NumMacClusters  = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
     static constexpr index_t KPerInnerLoop   = math::max(KPerThread / NumMacClusters, KPack);
     static constexpr index_t KRepeat         = KPerThread / KPerInnerLoop;
@@ -486,9 +490,9 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -551,20 +555,20 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                     static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, k_ + ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -640,20 +644,20 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, k_ + ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -704,7 +708,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                    I1));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -714,7 +718,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
index ea4f5e4a28..1af982e165 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
@@ -144,6 +144,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
     using Base::a_block_desc_m0_m1_m2_k;
     using Base::b_block_desc_n0_n1_n2_k;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t AMmaKStride = xdlops_gemm.K0PerXdlops * KPack;
     static constexpr index_t BMmaKStride = xdlops_gemm.K0PerXdlops * KPack;
 
@@ -222,10 +224,12 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
 
         // stage 1
         // Separate this part?
-        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
-        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
-        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
-        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataTypeBuf) / sizeof(ADataType) >
+        //                                               sizeof(ComputeDataTypeBuf) /
+        //                                               sizeof(BDataType)
+        //                                           ? sizeof(ComputeDataTypeBuf) /
+        //                                           sizeof(ADataType) : sizeof(ComputeDataTypeBuf)
+        //                                           / sizeof(BDataType);
         constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
         constexpr auto num_mfma_per_issue =
             num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
@@ -351,9 +355,9 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
     {
         __builtin_amdgcn_sched_barrier(0);
         // assume kperblock = scaleblockk
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
         auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             a_scale_thread_desc.GetElementSpaceSize());
@@ -516,17 +520,17 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                     .template AsType<AccDataType>()(Number<t>{}) = 0;
                             });
                             static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0,
                                                        I0,
                                                        kscale0 * KRepeat / num_scale_k_block + k0,
                                                        ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0,
                                                        I0,
@@ -535,7 +539,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 xdlops_gemm.template Run<>(
@@ -646,17 +650,17 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                 .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
                         static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0,
                                                    I0,
                                                    kscale0 * KRepeat / num_scale_k_block + k0,
                                                    ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0,
                                                    I0,
@@ -665,7 +669,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -737,17 +741,17 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                 .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
                         static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0,
                                                    I0,
                                                    kscale0 * KRepeat / num_scale_k_block + k0,
                                                    ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0,
                                                    I0,
@@ -756,7 +760,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -791,17 +795,17 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                 .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
                         static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0,
                                                    I0,
                                                    kscale0 * KRepeat / num_scale_k_block + k0,
                                                    ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0,
                                                    I0,
@@ -810,7 +814,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -842,7 +846,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
     using Base::b_thread_desc_;
     using Base::c_thread_desc_;
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
@@ -852,7 +856,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
index 4246f4a44e..123174e090 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
@@ -140,6 +140,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intra
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 1;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
@@ -205,12 +207,12 @@ struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intra
     {
         // assume kperblock = scaleblockk
         ignore            = num_loop_per_scale;
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
-        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_scale_thread_desc.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -279,20 +281,20 @@ struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intra
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         c_thread_buf_per_scale.Clear();
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -360,20 +362,20 @@ struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intra
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
                     c_thread_buf_per_scale.Clear();
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         xdlops_gemm.template Run<>(
                             a_thread_vec.template AsType<mfma_input_type>(),
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
index 4cc1cf569d..b474ddf528 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
@@ -141,6 +141,8 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
     using Base::BMmaKStride;
     using Base::WaveSize;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t WgpPerCU =
         (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
@@ -225,9 +227,9 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -284,20 +286,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -355,20 +357,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -410,20 +412,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -461,20 +463,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -628,6 +630,8 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
     using Base::b_block_desc_n0_n1_n2_k;
     using Base::WaveSize;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
     static constexpr index_t KPerInnerLoop  = math::max(KPerThread / NumMacClusters, KPack);
     static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
@@ -716,9 +720,9 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -786,20 +790,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                             static_for<0, MRepeat, 1>{}([&](auto m0) {
                                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                    vector_type<ComputeDataType, KPack> a_thread_vec;
-                                    vector_type<ComputeDataType, KPack> b_thread_vec;
+                                    vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                    vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                     static_for<0, KPack, 1>{}([&](auto ik) {
-                                        a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                             a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                                 make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                        b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                             b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                                 make_tuple(n0, I0, k0, k_ + ik))>{}];
                                     });
 
                                     using mfma_input_type =
-                                        typename vector_type<ComputeDataType,
+                                        typename vector_type<ComputeDataTypeBuf,
                                                              xdlops_gemm.K1PerXdlops>::type;
 
                                     constexpr index_t c_offset =
@@ -885,20 +889,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                     static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, k_ + ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -961,20 +965,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, k_ + ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -1037,20 +1041,20 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, k_ + ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -1129,7 +1133,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                    I1));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -1139,7 +1143,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
index 119f8a3306..70f31246f2 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
@@ -143,6 +143,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
     using Base::BMmaKStride;
     using Base::WaveSize;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t WgpPerCU =
         (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
@@ -257,9 +259,9 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
     {
         // assume kperblock = scaleblockk
         ignore            = num_loop_per_scale;
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
         auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             a_scale_thread_desc.GetElementSpaceSize());
@@ -351,20 +353,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
                             c_thread_buf_per_scale.Clear();
                             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 xdlops_gemm.template Run<>(
@@ -457,20 +459,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         c_thread_buf_per_scale.Clear();
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -547,20 +549,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
                     c_thread_buf_per_scale.Clear();
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         xdlops_gemm.template Run<>(
                             a_thread_vec.template AsType<mfma_input_type>(),
@@ -605,20 +607,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
                     c_thread_buf_per_scale.Clear();
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         xdlops_gemm.template Run<>(
                             a_thread_vec.template AsType<mfma_input_type>(),
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
index 80c65515e8..aded984c1e 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
@@ -141,6 +141,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
     using Base::BMmaKStride;
     using Base::WaveSize;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t WgpPerCU =
         (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
@@ -225,9 +227,9 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -285,20 +287,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -356,20 +358,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -411,20 +413,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -462,20 +464,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -629,6 +631,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
     using Base::b_block_desc_n0_n1_n2_k;
     using Base::WaveSize;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
     static constexpr index_t KPerInnerLoop  = math::max(KPerThread / NumMacClusters, KPack);
     static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
@@ -732,12 +736,12 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
     {
         ignore = num_loop_per_scale;
 
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
-        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_scale_thread_desc.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -821,20 +825,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                         static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                             static_for<0, MRepeat, 1>{}([&](auto m0) {
                                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                    vector_type<ComputeDataType, KPack> a_thread_vec;
-                                    vector_type<ComputeDataType, KPack> b_thread_vec;
+                                    vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                    vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                     static_for<0, KPack, 1>{}([&](auto ik) {
-                                        a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                             a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                                 make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                        b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                             b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                                 make_tuple(n0, I0, k0, k_ + ik))>{}];
                                     });
 
                                     using mfma_input_type =
-                                        typename vector_type<ComputeDataType,
+                                        typename vector_type<ComputeDataTypeBuf,
                                                              xdlops_gemm.K1PerXdlops>::type;
 
                                     constexpr index_t c_offset =
@@ -942,20 +946,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                     static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, k0, k_ + ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -1039,20 +1043,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, k_ + ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -1123,20 +1127,20 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, k_ + ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, k_ + ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -1223,7 +1227,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                    I1));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -1233,7 +1237,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
index 7203348418..f797c611a8 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
@@ -142,6 +142,8 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
@@ -196,10 +198,10 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
 
         // stage 1
         // Separate this part?
-        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
-        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
-        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
-        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataTypeBuf) / sizeof(ADataType) >
+        //                                           sizeof(ComputeDataTypeBuf) / sizeof(BDataType)
+        //                                       ? sizeof(ComputeDataTypeBuf) / sizeof(ADataType)
+        //                                       : sizeof(ComputeDataTypeBuf) / sizeof(BDataType);
         constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
         constexpr auto num_mfma_per_issue =
             num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
@@ -295,9 +297,9 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                         index_t num_loop) const
     {
         __builtin_amdgcn_sched_barrier(0);
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -364,20 +366,20 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -424,20 +426,20 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
index a7d22066ac..3f4f7ea7e8 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
@@ -143,6 +143,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
@@ -196,10 +198,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
 
         // stage 1
         // Separate this part?
-        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
-        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
-        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
-        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataTypeBuf) / sizeof(ADataType) >
+        //                                           sizeof(ComputeDataTypeBuf) / sizeof(BDataType)
+        //                                       ? sizeof(ComputeDataTypeBuf) / sizeof(ADataType)
+        //                                       : sizeof(ComputeDataTypeBuf) / sizeof(BDataType);
         constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
         constexpr auto num_mfma_per_issue =
             num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
@@ -329,9 +331,9 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         static_assert(CScaleThreadDesc{}.GetLength(Number<2>{}) == 1,
                       "Pipeline v3 only support scaleblocksliceN=1");
         // assume kperblock = scaleblockk
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
         auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             a_scale_thread_desc.GetElementSpaceSize());
@@ -476,20 +478,20 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                 .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             xdlops_gemm.template Run<>(
@@ -578,20 +580,20 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                             .template AsType<AccDataType>()(Number<t>{}) = 0;
                     });
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         xdlops_gemm.template Run<>(
                             a_thread_vec.template AsType<mfma_input_type>(),
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
index 3179a90b7f..35be8b9551 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
@@ -142,6 +142,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
@@ -195,10 +197,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
 
         // stage 1
         // Separate this part?
-        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
-        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
-        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
-        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataTypeBuf) / sizeof(ADataType) >
+        //                                           sizeof(ComputeDataTypeBuf) / sizeof(BDataType)
+        //                                       ? sizeof(ComputeDataTypeBuf) / sizeof(ADataType)
+        //                                       : sizeof(ComputeDataTypeBuf) / sizeof(BDataType);
         constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
         constexpr auto num_mfma_per_issue =
             num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
@@ -307,13 +309,13 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
     {
         __builtin_amdgcn_sched_barrier(0);
 
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // B scale buffer
-        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_scale_thread_desc.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -429,20 +431,20 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeDataType, KPack> a_thread_vec;
-                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                         make_tuple(m0, I0, k0, ik))>{}];
-                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                         make_tuple(n0, I0, k0, ik))>{}];
                             });
 
                             using mfma_input_type =
-                                typename vector_type<ComputeDataType,
+                                typename vector_type<ComputeDataTypeBuf,
                                                      xdlops_gemm.K1PerXdlops>::type;
 
                             constexpr index_t c_offset =
@@ -491,20 +493,20 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
index 9835d9325b..c762b3be15 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
@@ -142,6 +142,8 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 3;
     static constexpr index_t PrefillStages   = 2;
     static constexpr index_t GlobalBufferNum = 1;
@@ -264,9 +266,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         StaticallyIndexedArray<decltype(a_thread_buf), Number<2>{}> a_thread_bufs;
@@ -369,22 +371,22 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_bufs[mfma_reg_buf]
                                                      [Number<a_thread_desc_.CalculateOffset(
                                                          make_tuple(m0, I0, k0, ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_bufs[mfma_reg_buf]
                                                      [Number<b_thread_desc_.CalculateOffset(
                                                          make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -439,20 +441,20 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -492,20 +494,20 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -524,20 +526,20 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
index f35c7a97cc..3819f572c0 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
@@ -142,6 +142,8 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 3;
     static constexpr index_t PrefillStages   = 2;
     static constexpr index_t GlobalBufferNum = 1;
@@ -277,13 +279,13 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
                         index_t num_loop,
                         index_t num_loop_per_scale) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // B scale buffer
-        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_scale_thread_desc.GetElementSpaceSize());
 
         StaticallyIndexedArray<decltype(a_thread_buf), Number<2>{}> a_thread_bufs;
@@ -478,22 +480,22 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                vector_type<ComputeDataType, KPack> a_thread_vec;
-                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                                vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_bufs[mfma_reg_buf]
                                                      [Number<a_thread_desc_.CalculateOffset(
                                                          make_tuple(m0, I0, k0, ik))>{}];
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_bufs[mfma_reg_buf]
                                                      [Number<b_thread_desc_.CalculateOffset(
                                                          make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -549,20 +551,20 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -603,20 +605,20 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -635,20 +637,20 @@ struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intra
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                        vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
index 99934fa74e..d5bc6369dd 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
@@ -144,6 +144,8 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
     using Base::AMmaKStride;
     using Base::BMmaKStride;
 
+    using ComputeDataTypeBuf = typename Base::ComputeDataTypeBuf;
+
     static constexpr index_t PrefetchStages  = 3;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 2;
@@ -346,9 +348,9 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                         CThreadBuffer& c_thread_buf,
                         index_t num_loop) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
             b_thread_desc_.GetElementSpaceSize());
 
         // Global prefetch 1
@@ -405,8 +407,8 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
             do
             {
                 auto LoopFunc = [&](auto vmem_buf) {
-                    vector_type<ComputeDataType, KPack> a_thread_vec;
-                    vector_type<ComputeDataType, KPack> b_thread_vec;
+                    vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+                    vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         if constexpr(k0 == (KRepeat - 1))
@@ -427,18 +429,18 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                         static_for<0, MRepeat, 1>{}([&](auto m0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                             make_tuple(m0, I0, I0, ik))>{}];
                                 });
                                 static_for<0, KPack, 1>{}([&](auto ik) {
-                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                         b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                             make_tuple(n0, I0, I0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
-                                    typename vector_type<ComputeDataType,
+                                    typename vector_type<ComputeDataTypeBuf,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
                                 constexpr index_t c_offset =
@@ -481,8 +483,8 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
         }
         // tail
         auto ReadWriteCompFunc = [&](auto vmem_buf) {
-            vector_type<ComputeDataType, KPack> a_thread_vec;
-            vector_type<ComputeDataType, KPack> b_thread_vec;
+            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 if constexpr(k0 == (KRepeat - 1))
@@ -497,18 +499,18 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, I0, ik))>{}];
                         });
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, I0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -540,25 +542,25 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
             HotLoopScheduler();
         };
         auto ReadCompFunc = [&]() {
-            vector_type<ComputeDataType, KPack> a_thread_vec;
-            vector_type<ComputeDataType, KPack> b_thread_vec;
+            vector_type<ComputeDataTypeBuf, KPack> a_thread_vec;
+            vector_type<ComputeDataTypeBuf, KPack> b_thread_vec;
 
             static_for<0, KRepeat - 1, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, I0, I0, ik))>{}];
                         });
                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                            b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, I0, I0, ik))>{}];
                         });
 
                         using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -591,16 +593,16 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
                     static_for<0, KPack, 1>{}([&](auto ik) {
-                        a_thread_vec.template AsType<ComputeDataType>()(ik) = a_thread_buf
+                        a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) = a_thread_buf
                             [Number<a_thread_desc_.CalculateOffset(make_tuple(m0, I0, I0, ik))>{}];
                     });
                     static_for<0, KPack, 1>{}([&](auto ik) {
-                        b_thread_vec.template AsType<ComputeDataType>()(ik) = b_thread_buf
+                        b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) = b_thread_buf
                             [Number<b_thread_desc_.CalculateOffset(make_tuple(n0, I0, I0, ik))>{}];
                     });
 
                     using mfma_input_type =
-                        typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                        typename vector_type<ComputeDataTypeBuf, xdlops_gemm.K1PerXdlops>::type;
 
                     constexpr index_t c_offset =
                         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -637,7 +639,7 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
         make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, I1, I1, Number<KPack>{}));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
@@ -647,7 +649,7 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
-                                                         ComputeDataType,
+                                                         ComputeDataTypeBuf,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPack>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp
index 90f356987d..55e856b641 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -49,7 +49,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    static constexpr index_t WaveSize = get_warp_size();
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
     static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
@@ -66,9 +68,6 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               FloatAcc,
                               MRepeat * NRepeat,
@@ -158,6 +157,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
     __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1()
     {
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
                           BK0NK1BlockDesc::IsKnownAtCompileTime(),
                       "wrong! Desc should be known at compile-time");
@@ -173,6 +173,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         static_assert(
             KPack % (16 * sizeof(ComputeTypeA)) == 0,
             "KPack must be divisbile by number of elements processed in single smfmac instruction");
+#endif
     }
 
     __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index e6bb2d8db3..55015dd30f 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -49,7 +49,10 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    static constexpr index_t WaveSize = get_warp_size();
+    using ElementDataTypeA =
+        conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA>;
+    using ElementDataTypeB =
+        conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB>;
 
     static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
     static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
@@ -61,14 +64,15 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
     static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
 
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
+
     static constexpr auto xdlops_gemm =
-        XdlopsGemm<ComputeTypeA, MPerXDL, NPerXDL, KPack, ComputeTypeB>{};
+        XdlopsGemm<ComputeTypeA, MPerXDL, NPerXDL, KPack, ComputeTypeB, false, false>{};
 
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               FloatAcc,
                               MRepeat * NRepeat,
@@ -173,6 +177,11 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
         static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
                       "wrong!");
+        if constexpr(is_same_v<ComputeTypeA, ck::tf32_t> || is_same_v<ComputeTypeB, ck::tf32_t>)
+        {
+            static_assert(is_same_v<ComputeTypeA, ComputeTypeB>,
+                          "ComputeTypeA and ComputeTypeB must be same when one of them is tf32");
+        }
     }
 
     __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
@@ -298,9 +307,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                         const BBlockBuffer& b_block_buf,
                         CThreadBuffer& c_thread_buf) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ElementDataTypeA>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ElementDataTypeB>(
             b_thread_desc_.GetElementSpaceSize());
 
         static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -322,20 +331,20 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                    b_thread_buf);
 
                 static_for<0, KPerThread, KPack>{}([&](auto k) {
-                    vector_type<ComputeTypeA, KPack> a_thread_vec;
-                    vector_type<ComputeTypeB, KPack> b_thread_vec;
+                    vector_type<ElementDataTypeA, KPack> a_thread_vec;
+                    vector_type<ElementDataTypeB, KPack> b_thread_vec;
 
                     static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<ComputeTypeA>()(i) = a_thread_buf
+                        a_thread_vec.template AsType<ElementDataTypeA>()(i) = a_thread_buf
                             [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
-                        b_thread_vec.template AsType<ComputeTypeB>()(i) = b_thread_buf
+                        b_thread_vec.template AsType<ElementDataTypeB>()(i) = b_thread_buf
                             [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
                     });
 
                     using mfma_input_type_a =
-                        typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                        typename vector_type<ElementDataTypeA, xdlops_gemm.K1PerXdlops>::type;
                     using mfma_input_type_b =
-                        typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;
+                        typename vector_type<ElementDataTypeB, xdlops_gemm.K1PerXdlops>::type;
 
                     constexpr index_t c_offset =
                         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -362,7 +371,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
-                                                         ComputeTypeA,
+                                                         ElementDataTypeA,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerThread>,
@@ -372,7 +381,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
-                                                         ComputeTypeB,
+                                                         ElementDataTypeB,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerThread>,
@@ -446,6 +455,11 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     using Base::KPerThread;
     using Base::xdlops_gemm;
 
+    using ElementDataTypeA =
+        conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA>;
+    using ElementDataTypeB =
+        conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB>;
+
     static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack);
 
     // 2-wave optimized blockwise gemm
@@ -454,9 +468,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                         const BBlockBuffer& b_block_buf,
                         CThreadBuffer& c_thread_buf) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ElementDataTypeA>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ElementDataTypeB>(
             b_thread_desc_.GetElementSpaceSize());
 
         static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
@@ -500,22 +514,22 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
             static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeTypeA, KPack> a_thread_vec;
-                        vector_type<ComputeTypeB, KPack> b_thread_vec;
+                        vector_type<ElementDataTypeA, KPack> a_thread_vec;
+                        vector_type<ElementDataTypeB, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto i) {
-                            a_thread_vec.template AsType<ComputeTypeA>()(i) =
+                            a_thread_vec.template AsType<ElementDataTypeA>()(i) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, 0, 0, k_ + i))>{}];
-                            b_thread_vec.template AsType<ComputeTypeB>()(i) =
+                            b_thread_vec.template AsType<ElementDataTypeB>()(i) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, 0, 0, k_ + i))>{}];
                         });
 
                         using mfma_input_type_a =
-                            typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ElementDataTypeA, xdlops_gemm.K1PerXdlops>::type;
                         using mfma_input_type_b =
-                            typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;
+                            typename vector_type<ElementDataTypeB, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -564,7 +578,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
 
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
-                                                         ComputeTypeA,
+                                                         ElementDataTypeA,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -574,7 +588,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          A_K1>;
 
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
-                                                         ComputeTypeB,
+                                                         ElementDataTypeB,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -623,19 +637,21 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
     }
     else if constexpr(LoopSched == LoopScheduler::Interwave)
     {
-        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                            FloatA,
-                                                                            FloatB,
-                                                                            FloatAcc,
-                                                                            AK0MK1BlockDesc,
-                                                                            BK0NK1BlockDesc,
-                                                                            MPerXDL,
-                                                                            NPerXDL,
-                                                                            MRepeat,
-                                                                            NRepeat,
-                                                                            KPack,
-                                                                            ComputeTypeA,
-                                                                            ComputeTypeB>{};
+        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+            BlockSize,
+            FloatA,
+            FloatB,
+            FloatAcc,
+            AK0MK1BlockDesc,
+            BK0NK1BlockDesc,
+            MPerXDL,
+            NPerXDL,
+            MRepeat,
+            NRepeat,
+            KPack,
+            ComputeTypeA,
+            ComputeTypeB,
+            CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>{};
     }
 };
 
@@ -679,7 +695,9 @@ struct BlockwiseGemmXdlops_v2
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    static constexpr index_t WaveSize = get_warp_size();
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
@@ -691,9 +709,6 @@ struct BlockwiseGemmXdlops_v2
 
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     static_assert(KPerThread % KPack == 0,
                   "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
 
@@ -790,6 +805,7 @@ struct BlockwiseGemmXdlops_v2
                                                Tuple4 b_origin = CalculateBThreadOriginDataIndex())
         : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
     {
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(),
                       "wrong! Desc should be known at compile-time");
 
@@ -798,6 +814,7 @@ struct BlockwiseGemmXdlops_v2
 
         static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
                       "wrong!");
+#endif
     }
 
     // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
index 84ee096cba..33457f4b0a 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -30,8 +30,6 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    static constexpr index_t WaveSize = 64;
-
     static constexpr index_t KPerBlock = K0PerBlock * KPack;
 
     static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
@@ -42,8 +40,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
     static constexpr index_t KPerThread  = KPerBlock / xdlops_gemm.K0PerXdlops;
     static constexpr index_t K0PerThread = K0PerBlock / xdlops_gemm.K0PerXdlops;
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               FloatAcc,
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index c946abb77d..e7ce7cbcf5 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -11,6 +11,7 @@
 
 #include "ck/stream_config.hpp"
 #endif
+#include "ck/utility/get_id.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -46,6 +47,151 @@ namespace device {
 #define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL
 #endif
 
+template <index_t BlockSize_,
+          index_t MPerBlock_,
+          index_t NPerBlock_,
+          index_t MPerXDL_,
+          index_t NPerXDL_,
+          index_t MXdlPerWave_,
+          bool IsWave64>
+static constexpr auto GetNXdlPerWave2()
+{
+    constexpr index_t Waves  = IsWave64 ? BlockSize_ / 64 : BlockSize_ / 32;
+    constexpr index_t MWaves = MPerBlock_ / (MXdlPerWave_ * MPerXDL_);
+    static_assert(MWaves > 0);
+
+    constexpr index_t NWaves = Waves / MWaves;
+    if constexpr(NWaves == 0)
+    {
+        return 0;
+    }
+    else
+    {
+        if constexpr(NPerBlock_ % (NPerXDL_ * NWaves) == 0)
+        {
+            return NPerBlock_ / (NWaves * NPerXDL_);
+        }
+        else
+        {
+            return 0;
+        }
+    }
+}
+
+#define GET_NXDL_PER_WAVE_IMPL              \
+    template <bool IsWave64>                \
+    static constexpr auto GetNXdlPerWave()  \
+    {                                       \
+        return GetNXdlPerWave2<BlockSize,   \
+                               MPerBlock,   \
+                               NPerBlock,   \
+                               MPerXDL,     \
+                               NPerXDL,     \
+                               MXdlPerWave, \
+                               IsWave64>(); \
+    }
+
+#define INVOKER_RUN_IMPL                                                               \
+    float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) \
+    {                                                                                  \
+        if(get_warp_size() == 64)                                                      \
+        {                                                                              \
+            if constexpr(NXdlPerWave64 > 0)                                            \
+            {                                                                          \
+                return RunImp<GridwiseGemm64>(arg, stream_config);                     \
+            }                                                                          \
+        }                                                                              \
+        else                                                                           \
+        {                                                                              \
+            if constexpr(NXdlPerWave32 > 0)                                            \
+            {                                                                          \
+                return RunImp<GridwiseGemm32>(arg, stream_config);                     \
+            }                                                                          \
+        }                                                                              \
+        return 0;                                                                      \
+    }
+
+#define INVOKER_RUN3_IMPL                                                              \
+    float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) \
+    {                                                                                  \
+        if(get_warp_size() == 64)                                                      \
+        {                                                                              \
+            if constexpr(NXdlPerWave64 > 0)                                            \
+            {                                                                          \
+                return RunImp<GridwiseGemm64>(arg, stream_config);                     \
+            }                                                                          \
+        }                                                                              \
+        else                                                                           \
+        {                                                                              \
+            if constexpr(NXdlPerWave32 > 0)                                            \
+            {                                                                          \
+                return RunImp<GridwiseGemm32>(                                         \
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg),   \
+                    stream_config);                                                    \
+            }                                                                          \
+        }                                                                              \
+        return 0;                                                                      \
+    }
+
+template <index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename CDataType,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+__device__ static bool constexpr IsValidGemmCompilationParameter()
+{
+#if defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(MPerXdl != 16 || NPerXdl != 16)
+    {
+        return false;
+    }
+#endif
+
+#if defined(__gfx11__)
+    constexpr bool SupportMemOp = CGlobalMemoryDataOperation_ == InMemoryDataOperationEnum::Set;
+#else
+    constexpr bool SupportMemOp =
+        sizeof(CDataType) >= 2 || (CGlobalMemoryDataOperation_ == InMemoryDataOperationEnum::Set);
+#endif
+    if constexpr(SupportMemOp == false)
+    {
+        return false;
+    }
+
+    if constexpr(MXdlPerWave > 0 && NXdlPerWave > 0)
+    {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+        if constexpr(MWaves > 0 && NWaves > 0)
+        {
+            constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+            return WaveSize == get_warp_size();
+        }
+    }
+    return false;
+}
+
+#define IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType_)                       \
+    template <InMemoryDataOperationEnum CGlobalMemoryDataOperation_ =         \
+                  InMemoryDataOperationEnum::Set>                             \
+    __device__ static bool constexpr IsValidCompilationParameter()            \
+    {                                                                         \
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter< \
+            BlockSize,                                                        \
+            MPerBlock,                                                        \
+            NPerBlock,                                                        \
+            MPerXdl,                                                          \
+            NPerXdl,                                                          \
+            MXdlPerWave,                                                      \
+            NXdlPerWave,                                                      \
+            CDataType_,                                                       \
+            CGlobalMemoryDataOperation_>();                                   \
+    }
+
 #ifndef CK_CODE_GEN_RTC
 struct BaseArgument
 {
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
index cbb9fadc6d..5de33c90fe 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -55,6 +55,155 @@ struct DeviceGemmMultipleABD : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
+// GEMM:
+//   input : A0[M, K], B0[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABDSplitK : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        ck::index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+/// @brief Wrapper for backward compatibility that allows to use instances of
+///        DeviceGemmMultipleABDSplitK in contexts where DeviceGemmMultipleABD is expected.
+///
+/// @note  The main area where it can be used is DeviceOperationInstanceFactory::GetInstances().
+///        The only difference between API of DeviceGemmMultipleABD and DeviceGemmMultipleABDSplitK
+///        is that DeviceGemmMultipleABDSplitK::MakeArgumentPointer requires an additional parameter
+///        KBatch which is explicitly passed as 1 by this wrapper.
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABDSplitKWrapper : public DeviceGemmMultipleABD<AsLayout,
+                                                                         BsLayout,
+                                                                         DsLayout,
+                                                                         ELayout,
+                                                                         AsDataType,
+                                                                         BsDataType,
+                                                                         DsDataType,
+                                                                         EDataType,
+                                                                         AElementwiseOperation,
+                                                                         BElementwiseOperation,
+                                                                         CDEElementwiseOperation>
+{
+
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 AElementwiseOperation,
+                                                 BElementwiseOperation,
+                                                 CDEElementwiseOperation>;
+
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+#ifndef __HIPCC_RTC__
+
+    explicit DeviceGemmMultipleABDSplitKWrapper(std::unique_ptr<DeviceOp> p_op)
+        : p_op_(std::move(p_op))
+    {
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return p_op_->IsSupportedArgument(p_arg);
+    }
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return p_op_->MakeArgumentPointer(p_as,
+                                          p_bs,
+                                          p_ds,
+                                          p_e,
+                                          M,
+                                          N,
+                                          K,
+                                          StrideAs,
+                                          StrideBs,
+                                          StrideDs,
+                                          StrideE,
+                                          1, // KBatch
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return p_op_->MakeInvokerPointer();
+    }
+
+    std::string GetTypeString() const override { return p_op_->GetTypeString(); }
+
+    private:
+    std::unique_ptr<DeviceOp> p_op_;
+
+#endif // __HIPCC_RTC__
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
index 6ebdbc5054..6769ba347e 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -195,6 +195,107 @@ struct DeviceMoEGemmMXBPreShuffle : public BaseOperator
 #endif
 };
 
+/// @brief Wrapper for backward compatibility that allows to use instances of
+///        DeviceGemmMultipleDSplitK in contexts where DeviceGemmMultipleD is expected.
+///
+/// @note  The main area where it can be used is DeviceOperationInstanceFactory::GetInstances().
+///        The only difference between API of DeviceGemmMultipleD and DeviceGemmMultipleDSplitK is
+///        that DeviceGemmMultipleDSplitK::MakeArgumentPointer requires an additional parameter
+///        KBatch which is explicitly passed as 1 by this wrapper.
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleDSplitKWrapper : public DeviceGemmMultipleD<ALayout,
+                                                                     BLayout,
+                                                                     DsLayout,
+                                                                     ELayout,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     DsDataType,
+                                                                     EDataType,
+                                                                     AElementwiseOperation,
+                                                                     BElementwiseOperation,
+                                                                     CDEElementwiseOperation>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               DsLayout,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               EDataType,
+                                               AElementwiseOperation,
+                                               BElementwiseOperation,
+                                               CDEElementwiseOperation>;
+
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+#ifndef __HIPCC_RTC__
+
+    explicit DeviceGemmMultipleDSplitKWrapper(std::unique_ptr<DeviceOp> p_op)
+        : p_op_(std::move(p_op))
+    {
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return p_op_->IsSupportedArgument(p_arg);
+    }
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return p_op_->MakeArgumentPointer(p_a,
+                                          p_b,
+                                          p_ds,
+                                          p_e,
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideDs,
+                                          StrideE,
+                                          1, // KBatch
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return p_op_->MakeInvokerPointer();
+    }
+
+    std::string GetTypeString() const override { return p_op_->GetTypeString(); }
+
+    private:
+    std::unique_ptr<DeviceOp> p_op_;
+
+#endif // __HIPCC_RTC__
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
index 7296e4faaa..18223c78f7 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
@@ -11,6 +11,8 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+#define DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS 1
+
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp
index 6febf702f9..e4eeb8884b 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp
@@ -11,6 +11,8 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+#define DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS 1
+
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
diff --git a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index c71153768d..157e475267 100644
--- a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -94,79 +94,83 @@ __device__ void device_grouped_conv_fwd_multiple_abd_xdl_cshuffle(
     const Block2ETileMap block_2_ctile_map,
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
-
-    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
-    const auto& ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
-
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    DsPointer p_ds_grid_grp;
-
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
-
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
-
-    if constexpr(isMultiA || isMultiB)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        AsPointer p_as_grid_grp;
-        BsPointer p_bs_grid_grp;
+        // offset base pointer for each work-group
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-        const auto& as_batch_offset = compute_ptr_offset_of_batch.GetAsPtrOffset(g_idx);
+        const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const auto& ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
-        static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
-        static_for<0, NumATensor, 1>{}(
-            [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i]; });
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        const auto& bs_batch_offset = compute_ptr_offset_of_batch.GetBsPtrOffset(g_idx);
+        DsPointer p_ds_grid_grp;
 
-        static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
-        static_for<0, NumBTensor, 1>{}(
-            [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; });
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
 
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
-            p_as_grid_grp,
-            p_bs_grid_grp,
-            p_ds_grid_grp,
-            p_e_grid + e_batch_offset,
-            p_shared,
-            a_element_op,
-            b_element_op,
-            cde_element_op,
-            a_grid_desc_k0_m_k1,
-            b_grid_desc_k0_n_k1,
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-            e_grid_desc_mblock_mperblock_nblock_nperblock_,
-            block_2_ctile_map);
-    }
-    else
-    {
-        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-            p_as_grid + a_batch_offset,
-            p_bs_grid + b_batch_offset,
-            p_ds_grid_grp,
-            p_e_grid + e_batch_offset,
-            p_shared,
-            a_element_op,
-            b_element_op,
-            cde_element_op,
-            a_grid_desc_k0_m_k1,
-            b_grid_desc_k0_n_k1,
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-            e_grid_desc_mblock_mperblock_nblock_nperblock_,
-            block_2_ctile_map);
+        if constexpr(isMultiA || isMultiB)
+        {
+            AsPointer p_as_grid_grp;
+            BsPointer p_bs_grid_grp;
+
+            const auto& as_batch_offset = compute_ptr_offset_of_batch.GetAsPtrOffset(g_idx);
+
+            static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
+            static_for<0, NumATensor, 1>{}(
+                [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i]; });
+
+            const auto& bs_batch_offset = compute_ptr_offset_of_batch.GetBsPtrOffset(g_idx);
+
+            static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
+            static_for<0, NumBTensor, 1>{}(
+                [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; });
+
+            GridwiseGemm::template Run<HasMainKBlockLoop>(
+                p_as_grid_grp,
+                p_bs_grid_grp,
+                p_ds_grid_grp,
+                p_e_grid + e_batch_offset,
+                p_shared,
+                a_element_op,
+                b_element_op,
+                cde_element_op,
+                a_grid_desc_k0_m_k1,
+                b_grid_desc_k0_n_k1,
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                block_2_ctile_map);
+        }
+        else
+        {
+            const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+            const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+
+            GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+                p_as_grid + a_batch_offset,
+                p_bs_grid + b_batch_offset,
+                p_ds_grid_grp,
+                p_e_grid + e_batch_offset,
+                p_shared,
+                a_element_op,
+                b_element_op,
+                cde_element_op,
+                a_grid_desc_k0_m_k1,
+                b_grid_desc_k0_n_k1,
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                block_2_ctile_map);
+        }
     }
 #else
     ignore = p_as_grid;
@@ -353,6 +357,9 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                                              ComputeDataType>
 {
     using DeviceOp = CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
     static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
@@ -470,10 +477,13 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                      \
         CDEBlockTransferScalarPerVector_NPerBlock, LoopSched
     // Use appropriate gridwise gemm
-    using GridwiseGemm = ck::conditional_t<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = ck::conditional_t<
         isMultiA || isMultiB,
         GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmMultiABDTemplateParameters>,
         GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers.
     using APointers = ck::conditional_t<isMultiA, ck::Array<const void*, NumATensor>&, const void*>;
@@ -481,31 +491,74 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     // Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not
     // in initializer list what is required for single const pointer).
     using AGridPointer = remove_cvref_t<
-        decltype(GetAGridPointer < isMultiA || isMultiB, GridwiseGemm, ADataType > ())>;
+        decltype(GetAGridPointer < isMultiA || isMultiB, GridwiseGemm64, ADataType > ())>;
     using BGridPointer = remove_cvref_t<
-        decltype(GetBGridPointer < isMultiA || isMultiB, GridwiseGemm, BDataType > ())>;
+        decltype(GetBGridPointer < isMultiA || isMultiB, GridwiseGemm64, BDataType > ())>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument
     {
+        template <typename GridwiseGemm>
+        __host__ __device__ void init_ds_e_grid_desc()
+        {
+            if constexpr(isMultiA || isMultiB)
+            {
+                const auto as_grid_desc_ak0_m_ak1 =
+                    generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number<NumATensor>{});
+                const auto bs_grid_desc_bk0_n_bk1 =
+                    generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number<NumBTensor>{});
+
+                if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
+                                               bs_grid_desc_bk0_n_bk1,
+                                               ds_grid_desc_m_n_,
+                                               e_grid_desc_m_n_,
+                                               block_2_etile_map_))
+                {
+                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            e_grid_desc_m_n_);
+
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            ds_grid_desc_m_n_);
+                }
+            }
+            else
+            {
+                if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
+                                               b_grid_desc_n_k_,
+                                               ds_grid_desc_m_n_,
+                                               e_grid_desc_m_n_,
+                                               block_2_etile_map_))
+                {
+                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            e_grid_desc_m_n_);
+
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            ds_grid_desc_m_n_);
+                }
+            }
+        }
         __device__ __host__ Argument(
             APointers p_as,
             BPointers p_bs,
@@ -549,12 +602,12 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
               e_grid_desc_m_n_{
                   DeviceOp::MakeEGridDescriptor_M_N<ELayout>(conv_to_gemm_transformer_)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               compute_ptr_offset_of_batch_{},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
@@ -655,43 +708,18 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
 
             // populate desc for Ds/E
-            if constexpr(isMultiA || isMultiB)
+            if(get_warp_size() == 64)
             {
-                const auto as_grid_desc_ak0_m_ak1 =
-                    generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number<NumATensor>{});
-                const auto bs_grid_desc_bk0_n_bk1 =
-                    generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number<NumBTensor>{});
-
-                if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
-                                               bs_grid_desc_bk0_n_bk1,
-                                               ds_grid_desc_m_n_,
-                                               e_grid_desc_m_n_,
-                                               block_2_etile_map_))
+                if constexpr(NXdlPerWave64 > 0)
                 {
-                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            e_grid_desc_m_n_);
-
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            ds_grid_desc_m_n_);
+                    init_ds_e_grid_desc<GridwiseGemm64>();
                 }
             }
             else
             {
-                if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                               b_grid_desc_n_k_,
-                                               ds_grid_desc_m_n_,
-                                               e_grid_desc_m_n_,
-                                               block_2_etile_map_))
+                if constexpr(NXdlPerWave32 > 0)
                 {
-                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            e_grid_desc_m_n_);
-
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            ds_grid_desc_m_n_);
+                    init_ds_e_grid_desc<GridwiseGemm32>();
                 }
             }
         }
@@ -700,7 +728,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         // pointers (tuple if multi AB, pointer if no)
         AGridPointer p_as_grid_;
         BGridPointer p_bs_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -746,7 +774,31 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         ck::Array<index_t, NDimSpatial> input_left_pads_;
         ck::Array<index_t, NDimSpatial> input_right_pads_;
     };
-
+    template <typename GridwiseGemm>
+    static __device__ __host__ bool check_gemm_validity(const Argument& arg)
+    {
+        if constexpr(isMultiA || isMultiB)
+        {
+            // Genarate tuples with the same descriptors
+            const auto as_grid_desc_ak0_m_ak1 =
+                generate_tuple([&](auto) { return arg.a_grid_desc_m_k_; }, Number<NumATensor>{});
+            const auto bs_grid_desc_bk0_n_bk1 =
+                generate_tuple([&](auto) { return arg.b_grid_desc_n_k_; }, Number<NumBTensor>{});
+            return GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
+                                               bs_grid_desc_bk0_n_bk1,
+                                               arg.ds_grid_desc_m_n_,
+                                               arg.e_grid_desc_m_n_,
+                                               arg.block_2_etile_map_);
+        }
+        else
+        {
+            return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
+                                               arg.b_grid_desc_n_k_,
+                                               arg.ds_grid_desc_m_n_,
+                                               arg.e_grid_desc_m_n_,
+                                               arg.block_2_etile_map_);
+        }
+    }
     static __device__ __host__ bool IsSupportedArgument(const Argument& arg)
     {
         namespace ctc = tensor_layout::convolution;
@@ -898,27 +950,21 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         }
 
         // check Gridwise GEMM
-        if constexpr(isMultiA || isMultiB)
+        if(get_warp_size() == 64)
         {
-            // Genarate tuples with the same descriptors
-            const auto as_grid_desc_ak0_m_ak1 =
-                generate_tuple([&](auto) { return arg.a_grid_desc_m_k_; }, Number<NumATensor>{});
-            const auto bs_grid_desc_bk0_n_bk1 =
-                generate_tuple([&](auto) { return arg.b_grid_desc_n_k_; }, Number<NumBTensor>{});
-            return GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
-                                               bs_grid_desc_bk0_n_bk1,
-                                               arg.ds_grid_desc_m_n_,
-                                               arg.e_grid_desc_m_n_,
-                                               arg.block_2_etile_map_);
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return check_gemm_validity<GridwiseGemm64>(arg);
+            }
         }
         else
         {
-            return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                               arg.b_grid_desc_n_k_,
-                                               arg.ds_grid_desc_m_n_,
-                                               arg.e_grid_desc_m_n_,
-                                               arg.block_2_etile_map_);
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return check_gemm_validity<GridwiseGemm32>(arg);
+            }
         }
+        return false;
     }
 
     static __device__ __host__ auto MakeArgument(
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
index ab3f3856aa..537e6dab28 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include <iostream>
 #include <sstream>
 
+#include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -853,7 +854,10 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
                                       arg.e_grid_desc_m_n_,
                                       arg.block_2_ctile_map_))
         {
-            printf("GridwiseOp: Validity check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: Validity check failure\n");
+            }
             return false;
         }
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
index f59ea3efde..be09d1b505 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -56,44 +56,48 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
-    FloatDsPointer p_ds_grid_grp;
+        FloatDsPointer p_ds_grid_grp;
 
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
 
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        p_a_grid + a_batch_offset,
-        p_b_grid + b_batch_offset,
-        p_ds_grid_grp,
-        p_e_grid + e_batch_offset,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        e_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_ds_grid_grp,
+            p_e_grid + e_batch_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -214,6 +218,10 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
 {
     using DeviceOp = DeviceBatchedContractionMultipleD_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -546,7 +554,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
     using ComputeDataType = ADataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType,
         BDataType,
         ComputeDataType,
@@ -567,7 +576,7 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -589,28 +598,49 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm>
+        void init_ds_e_grid_desc()
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
+                                           b_grid_desc_n_k_,
+                                           ds_grid_desc_m_n_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+
+                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        ds_grid_desc_m_n_);
+            }
+        }
+
         Argument(const void* p_a_grid,
                  const void* p_b_grid,
                  std::array<const void*, NumDTensor> p_ds_grid,
@@ -642,12 +672,12 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
               e_grid_desc_g_m_n_{
                   DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -677,19 +707,19 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
             });
 
             // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
+            if(get_warp_size() == 64)
             {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm32>();
+                }
             }
 
             // for sanity check of vector memory access
@@ -719,7 +749,7 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -767,7 +797,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -836,6 +867,7 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
                 return launch_kernel(integral_constant<bool, false>{});
             }
         }
+        INVOKER_RUN_IMPL
 
         // polymorphic
         float Run(const BaseArgument* p_arg,
@@ -847,16 +879,35 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
 
-        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                        arg.b_grid_desc_n_k_,
-                                        arg.ds_grid_desc_m_n_,
-                                        arg.e_grid_desc_m_n_,
-                                        arg.block_2_etile_map_))
+        bool valid = false;
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                valid = GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                      arg.b_grid_desc_n_k_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                valid = GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                      arg.b_grid_desc_n_k_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
+        }
+        if(!valid)
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
index 8a8cf54e42..3d232b50bb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include <iostream>
@@ -74,34 +76,38 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                       const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
                                       const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        p_a_grid + a_batch_offset,
-        p_b_grid + b_batch_offset,
-        ck::Tuple<>{},
-        p_e_grid + e_batch_offset,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ck::Tuple<>{},
-        e_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            ck::Tuple<>{},
+            p_e_grid + e_batch_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ck::Tuple<>{},
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -172,6 +178,10 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
 {
     using DeviceOp = DeviceBatchedGemmEPermuteXdl;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -334,7 +344,8 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
     using ComputeDataType = ADataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType,
         BDataType,
         ComputeDataType,
@@ -359,7 +370,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -381,6 +392,8 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using AGridDesc_AK0_M_AK1 =
         remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
@@ -424,10 +437,9 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
                                                     batched_gemm_e_permute_desc.stride_M_,
                                                     batched_gemm_e_permute_desc.stride_N_)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              e_grid_desc_mblock_mperblock_nblock_nperblock{},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               e_grid_desc_g0_g1_m_n_{
                   DeviceOp::MakeEGridDescriptor_G0_G1_M_N(batched_gemm_e_permute_desc.G0_,
                                                           batched_gemm_e_permute_desc.G1_,
@@ -438,21 +450,11 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
                                                           batched_gemm_e_permute_desc.stride_M_,
                                                           batched_gemm_e_permute_desc.stride_N_)},
               compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ck::Tuple<>{},
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-            }
         }
 
         void Print() const
@@ -499,7 +501,9 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -511,7 +515,9 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
                     "wrong! GridwiseBatchedGemmCPermute_km_kn_m0m1n0n1_xdlops_v2r3 has invalid "
                     "setting");
             }
-
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.BatchCount_;
 
@@ -544,7 +550,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
                                               arg.BatchCount_,
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.a_element_op_,
                                               arg.b_element_op_,
                                               arg.cde_element_op_,
@@ -562,6 +568,8 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -578,7 +586,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..32669800cd
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp
@@ -0,0 +1,788 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename DeviceOp, typename GridwiseOp, bool HasMainKBlockLoop, TailNumber TailNum>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+    kernel_batched_gemm_gemm_wmma_cshuffle_v3(typename DeviceOp::RawArg arg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+
+    __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
+    const index_t num_blocks_per_batch =
+        __builtin_amdgcn_readfirstlane(get_grid_size() / arg.batch_count);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+
+    const long_index_t a_batch_offset =
+        __builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetABasePtr(g_idx)));
+    const long_index_t b0_batch_offset =
+        __builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB0BasePtr(g_idx)));
+    const long_index_t b1_batch_offset =
+        __builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
+    const long_index_t c_batch_offset =
+        __builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
+
+    GridwiseOp::template Run<HasMainKBlockLoop, TailNum>(
+        arg.p_a_grid + a_batch_offset,
+        arg.p_b0_grid + b0_batch_offset,
+        arg.p_b1_grid + b1_batch_offset,
+        arg.p_c_grid + c_batch_offset,
+        p_shared,
+        arg.a_grid_desc,
+        arg.b0_grid_desc,
+        arg.b1_grid_desc,
+        arg.c_grid_desc_mblock_mperblock_nblock_nperblock,
+        arg.a_element_op,
+        arg.b0_element_op,
+        arg.acc_element_op,
+        arg.b1_element_op,
+        arg.c_element_op,
+        arg.block_2_ctile_map);
+#else
+    ignore = arg;
+#endif // (!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__)
+}
+
+// Computes C = A  * B0 * B1
+//         MN = MK * KL * LN
+//              ^^^^^^ (Acc0)
+//              ^^^^^^^^^^^ (Acc1)
+template <typename ALayout,
+          typename B0layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename AccElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t LPerBlock,     // Gemm0NPerBlock
+          ck::index_t KPerBlock,     // Gemm0KPerBlock
+          ck::index_t NPerBlock,     // Gemm1NPerBlock
+          ck::index_t LTilePerBlock, // Gemm1KPerBlock
+          ck::index_t AK1,
+          ck::index_t BK1,
+          ck::index_t L1,       // B1K1
+          ck::index_t MPerWmma, // Gemm0/1 MPerWmma
+          ck::index_t LPerWmma, // Gemm0/1 NPerWmma
+          ck::index_t MRepeat,  // Gemm0/1 MWmmaPerWave or Mrepeat
+          ck::index_t LRepeat,  // Gemm0 NWmmaPerWave or Nrepeat
+          ck::index_t NRepeat,  // Gemm1 NWmmaPerWave or Nrepeat
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
+          typename B0BlockTransferThreadClusterLengths_K0_L_K1,
+          typename B0BlockTransferThreadClusterArrangeOrder,
+          typename B0BlockTransferSrcAccessOrder,
+          ck::index_t B0BlockTransferSrcVectorDim,
+          ck::index_t B0BlockTransferSrcScalarPerVector,
+          ck::index_t B0BlockTransferDstScalarPerVector_K1,
+          bool B0BlockLdsAddExtraL,
+          typename B1BlockTransferThreadClusterLengths_L0_N_L1,
+          typename B1BlockTransferThreadClusterArrangeOrder,
+          typename B1BlockTransferSrcAccessOrder,
+          ck::index_t B1BlockTransferSrcVectorDim,
+          ck::index_t B1BlockTransferSrcScalarPerVector,
+          ck::index_t B1BlockTransferDstScalarPerVector_L1,
+          bool B1BlockLdsAddExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1>
+struct DeviceBatchedGemmGemm_Wmma_CShuffleV3 : public DeviceBatchedGemmGemm<ALayout,
+                                                                            B0layout,
+                                                                            B1Layout,
+                                                                            CLayout,
+                                                                            ADataType,
+                                                                            B0DataType,
+                                                                            B1DataType,
+                                                                            CDataType,
+                                                                            AElementwiseOperation,
+                                                                            B0ElementwiseOperation,
+                                                                            AccElementwiseOperation,
+                                                                            B1ElementwiseOperation,
+                                                                            CElementwiseOperation>
+{
+    using DeviceOp = DeviceBatchedGemmGemm_Wmma_CShuffleV3;
+
+    static constexpr auto I0 = Number<0>{};
+
+    // To match XDL implementation NPerWmma (A.k.a Gemm1 NPerWmma) is set equal
+    // to LPerWmma (A.k.a Gemm0 NPerWmma).
+    static constexpr index_t NPerWmma = LPerWmma;
+
+    // TODO: Now that we are no longer using NumDim or TensorSpec, we can probably use a simpler
+    // Transform operator or just not use one at all.
+    using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma<
+        Sequence<1, 1, 1, 1, 1>,
+        Sequence<MPerBlock, LPerBlock, KPerBlock, NPerBlock>,
+        GemmSpec,
+        TensorSpecialization::Default,  // ASpec
+        TensorSpecialization::Default,  // B0Spec
+        TensorSpecialization::Default,  // B1Spec
+        TensorSpecialization::Default>; // CSpec
+
+    __host__ __device__ static auto
+    MakeAGridDescriptor(const std::array<index_t, 3>& a_g_m_k_lengths_vec,
+                        const std::array<index_t, 3>& a_g_m_k_strides_vec)
+    {
+        return Transform::MakeAGridDescriptor_AK0_M_AK1(
+            Transform::MakeAGridDescriptor_M_K(a_g_m_k_lengths_vec, a_g_m_k_strides_vec),
+            Number<AK1>{});
+    }
+
+    __host__ __device__ static auto
+    MakeB0GridDescriptor(const std::array<index_t, 3>& b0_g_l_k_lengths_vec,
+                         const std::array<index_t, 3>& b0_g_l_k_strides_vec)
+    {
+        return Transform::MakeB0GridDescriptor_BK0_N_BK1(
+            Transform::MakeB0GridDescriptor_N_K(b0_g_l_k_lengths_vec, b0_g_l_k_strides_vec),
+            Number<BK1>{});
+    }
+
+    __host__ __device__ static auto
+    MakeB1GridDescriptor(const std::array<index_t, 3>& b1_g_n_l_lengths_vec,
+                         const std::array<index_t, 3>& b1_g_n_l_strides_vec)
+    {
+        return Transform::MakeB1GridDescriptor_BK0_N_BK1(
+            Transform::MakeB1GridDescriptor_N_K(b1_g_n_l_lengths_vec, b1_g_n_l_strides_vec),
+            Number<L1>{});
+    }
+
+    using AGridDesc     = decltype(MakeAGridDescriptor({}, {}));
+    using B0GridDesc    = decltype(MakeB0GridDescriptor({}, {}));
+    using B1GridDesc    = decltype(MakeB1GridDescriptor({}, {}));
+    using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {}));
+
+    struct ComputeBasePtrOfStridedBatch
+    {
+        ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
+                                     index_t BatchStrideB0,
+                                     index_t BatchStrideB1,
+                                     index_t BatchStrideC)
+            : BatchStrideA_(BatchStrideA),
+              BatchStrideB0_(BatchStrideB0),
+              BatchStrideB1_(BatchStrideB1),
+              BatchStrideC_(BatchStrideC)
+        {
+        }
+
+        __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideA_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideB0_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideB1_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideC_);
+        }
+
+        private:
+        index_t BatchStrideA_;
+        index_t BatchStrideB0_;
+        index_t BatchStrideB1_;
+        index_t BatchStrideC_;
+    };
+
+    // GridwiseOp
+    using GridwiseOp = GridwiseBatchedGemmGemm_wmma_cshuffle_v3<
+        // DataType Family
+        ADataType,
+        B0DataType,
+        AccDataType, // Acc0DataType
+        B1DataType,
+        AccDataType, // Acc1DataType
+        CShuffleDataType,
+        CDataType,
+        // ElementwiseOp Family
+        AElementwiseOperation,
+        B0ElementwiseOperation,
+        AccElementwiseOperation,
+        B1ElementwiseOperation,
+        CElementwiseOperation,
+        InMemoryDataOperationEnum::Set,
+        // InMemory Data Descriptor
+        AGridDesc,
+        B0GridDesc,
+        B1GridDesc,
+        CGridDesc_M_N,
+        // Tiling Family
+        MPerBlock,
+        LPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        NPerBlock,
+        LTilePerBlock,
+        L1,
+        MPerWmma,
+        LPerWmma,
+        NPerWmma,
+        MRepeat,
+        LRepeat,
+        NRepeat,
+        // ThreadCluster Family
+        BlockSize,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        true,
+        ABlockLdsAddExtraM,
+        B0BlockTransferThreadClusterLengths_K0_L_K1,
+        B0BlockTransferThreadClusterArrangeOrder,
+        B0BlockTransferSrcAccessOrder,
+        B0BlockTransferSrcVectorDim,
+        B0BlockTransferSrcScalarPerVector,
+        B0BlockTransferDstScalarPerVector_K1,
+        true,
+        B0BlockLdsAddExtraL,
+        B1BlockTransferThreadClusterLengths_L0_N_L1,
+        B1BlockTransferThreadClusterArrangeOrder,
+        B1BlockTransferSrcAccessOrder,
+        B1BlockTransferSrcVectorDim,
+        B1BlockTransferSrcScalarPerVector,
+        B1BlockTransferDstScalarPerVector_L1,
+        false,
+        B1BlockLdsAddExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        Transform::matrix_padder.PadN,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer>;
+
+    struct RawArg : public BaseArgument
+    {
+        using arr3 = std::array<ck::index_t, 3>;
+
+        RawArg(const ADataType* p_a_grid_,
+               const B0DataType* p_b0_grid_,
+               const B1DataType* p_b1_grid_,
+               CDataType* p_c_grid_,
+               index_t M_,
+               index_t N_,
+               index_t K_,
+               index_t O_,
+               index_t Batch,
+               index_t StrideA,
+               index_t StrideB0,
+               index_t StrideB1,
+               index_t StrideC,
+               index_t BatchStrideA,
+               index_t BatchStrideB0,
+               index_t BatchStrideB1,
+               index_t BatchStrideC,
+               AElementwiseOperation a_element_op_,
+               B0ElementwiseOperation b0_element_op_,
+               AccElementwiseOperation acc_element_op_,
+               B1ElementwiseOperation b1_element_op_,
+               CElementwiseOperation c_element_op_)
+            : p_a_grid{p_a_grid_},
+              p_b0_grid{p_b0_grid_},
+              p_b1_grid{p_b1_grid_},
+              p_c_grid{p_c_grid_},
+              M{M_},
+              N{N_},
+              K{K_},
+              O{O_},
+              batch_count{Batch},
+              a_element_op{a_element_op_},
+              b0_element_op{b0_element_op_},
+              acc_element_op{acc_element_op_},
+              b1_element_op{b1_element_op_},
+              c_element_op{c_element_op_},
+              compute_base_ptr_of_batch{BatchStrideA, BatchStrideB0, BatchStrideB1, BatchStrideC}
+        {
+
+            a_g_m_k_lengths = arr3{batch_count, M, K};
+            a_g_m_k_strides = arr3{BatchStrideA, StrideA, 1}; // A layout [batch_count, M, K]
+
+            b0_g_n_k_lengths = arr3{batch_count, N, K};
+            b0_g_n_k_strides = arr3{BatchStrideB0, StrideB0, 1}; // B0 layout [batch_count, N, K]
+
+            b1_g_o_n_lengths = arr3{batch_count, O, N};
+            b1_g_o_n_strides =
+                is_same_v<B1Layout, tensor_layout::gemm::RowMajor>
+                    ? arr3{BatchStrideB1, 1, StrideB1}  // B1 layout [batch_count, N, O]
+                    : arr3{BatchStrideB1, StrideB1, 1}; // B1 layout [batch_count, O, N]
+
+            c_g_m_o_lengths = arr3{batch_count, M, O};
+            c_g_m_o_strides = arr3{BatchStrideC, StrideC, 1}; // C layout [batch_count, M, O]
+
+            a_grid_desc     = MakeAGridDescriptor(a_g_m_k_lengths, a_g_m_k_strides);
+            b0_grid_desc    = MakeB0GridDescriptor(b0_g_n_k_lengths, b0_g_n_k_strides);
+            b1_grid_desc    = MakeB1GridDescriptor(b1_g_o_n_lengths, b1_g_o_n_strides);
+            c_grid_desc_m_n = Transform::MakeCGridDescriptor_M_N(c_g_m_o_lengths, c_g_m_o_strides);
+            c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+            block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1);
+        }
+        // Pointers
+        const ADataType* p_a_grid;
+        const B0DataType* p_b0_grid;
+        const B1DataType* p_b1_grid;
+        CDataType* p_c_grid;
+
+        // Raw Problem Size
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t O;
+        index_t batch_count;
+
+        arr3 a_g_m_k_lengths;
+        arr3 a_g_m_k_strides;
+        arr3 b0_g_n_k_lengths;
+        arr3 b0_g_n_k_strides;
+        arr3 b1_g_o_n_lengths;
+        arr3 b1_g_o_n_strides;
+        arr3 c_g_m_o_lengths;
+        arr3 c_g_m_o_strides;
+
+        AElementwiseOperation a_element_op;
+        B0ElementwiseOperation b0_element_op;
+        AccElementwiseOperation acc_element_op;
+        B1ElementwiseOperation b1_element_op;
+        CElementwiseOperation c_element_op;
+
+        // Grid descriptors and other mem calculators
+        AGridDesc a_grid_desc;
+        B0GridDesc b0_grid_desc;
+        B1GridDesc b1_grid_desc;
+        CGridDesc_M_N c_grid_desc_m_n;
+        typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock;
+
+        typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map;
+
+        ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch;
+    };
+
+    static bool IsSupportedArgument([[maybe_unused]] const RawArg& arg)
+    {
+        // Print lambda with env check and printf() style formmating.
+        const char* curFunc = __func__;
+        auto print          = [&curFunc](const char* format, ...) -> void {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wformat-nonliteral"
+#endif
+                va_list args;
+                va_start(args, format);
+                std::vfprintf(stdout, format, args);
+                va_end(args);
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+                std::cout << "In file: " << __FILE__ << ", function: " << curFunc << "\n";
+            }
+        };
+
+        if(!(ck::is_gfx11_supported() || ck::is_gfx12_supported()))
+        {
+            print("DeviceOp: Arch err\n");
+            return false;
+        }
+
+        if constexpr(std::is_same_v<ADataType, f8_t> || std::is_same_v<ADataType, bf8_t> ||
+                     std::is_same_v<B0DataType, f8_t> || std::is_same_v<B0DataType, bf8_t> ||
+                     std::is_same_v<B1DataType, f8_t> || std::is_same_v<B1DataType, bf8_t>)
+        {
+            if(ck::is_gfx11_supported())
+            {
+                print("DeviceOp: gfx 11 does not support fp8\n");
+                return false;
+            }
+        }
+
+        if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
+        {
+            print("DeviceOp: Acc0 Type err\n");
+            return false;
+        }
+
+        if constexpr(!(is_same_v<ALayout, tensor_layout::gemm::RowMajor>))
+        {
+            print("DeviceOp: A layout must be Row\n");
+            return false;
+        }
+
+        if constexpr(!(is_same_v<B0layout, tensor_layout::gemm::ColumnMajor>))
+        {
+            print("DeviceOp: B layout must be Column\n");
+            return false;
+        }
+
+        if constexpr(!(is_same_v<B1Layout, tensor_layout::gemm::RowMajor> ||
+                       is_same_v<B1Layout, tensor_layout::gemm::ColumnMajor>))
+        {
+            print("DeviceOp: B1 layout must be Column or Row\n");
+            return false;
+        }
+
+        if constexpr(!(is_same_v<CLayout, tensor_layout::gemm::RowMajor>))
+        {
+            print("DeviceOp: C layout must be Row\n");
+            return false;
+        }
+
+        // Other padding modes have not been tested and do not get checked individually.
+        if constexpr(GemmSpec != GemmSpecialization::Default &&
+                     GemmSpec != GemmSpecialization::MNKOPadding)
+        {
+            print("Padding mode must be default or MNKO\n");
+            return false;
+        }
+
+        // Per wmma dimensions not equal to 16 are very untested.
+        if constexpr(MPerWmma != 16 || LPerWmma != 16 || NPerWmma != 16)
+        {
+            print("M, L, N per Wmma must be 16\n");
+            return false;
+        }
+
+        if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
+                                      arg.b0_grid_desc,
+                                      arg.b1_grid_desc,
+                                      arg.c_grid_desc_m_n,
+                                      arg.block_2_ctile_map))
+        {
+            return false;
+        }
+
+        // Check scalar per vector requirement
+        const auto a_extent_lowest  = ABlockTransferSrcVectorDim == 2 ? arg.K : arg.M;
+        const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? arg.K : arg.N;
+        const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? arg.N : arg.O;
+        const auto c_extent_lowest  = arg.O;
+
+        if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
+             b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
+             b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
+             c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
+        {
+            print("DeviceOp: Data Transfer Vector scalar err\n");
+            return false;
+        }
+
+        // Check vector load/store requirement
+        const auto a_stride_lowest =
+            ABlockTransferSrcVectorDim == 2 ? arg.a_g_m_k_strides[2] : arg.a_g_m_k_strides[1];
+        const auto b0_stride_lowest =
+            B0BlockTransferSrcVectorDim == 2 ? arg.b0_g_n_k_strides[2] : arg.b0_g_n_k_strides[1];
+        const auto b1_stride_lowest =
+            B1BlockTransferSrcVectorDim == 2 ? arg.b1_g_o_n_strides[2] : arg.b1_g_o_n_strides[1];
+        const auto c_stride_lowest = arg.c_g_m_o_strides[2];
+
+        if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
+             c_stride_lowest == 1))
+        {
+            print("DeviceOp: Data Vectorize transfer err\n");
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MNKOPadding))
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const RawArg*>(p_arg));
+    }
+
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::RawArg;
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const auto M0 = math::integer_divide_ceil(arg.M, MPerBlock);
+            const auto N0 = math::integer_divide_ceil(arg.O, NPerBlock);
+
+            const index_t grid_size = arg.batch_count * M0 * N0;
+
+            auto launch_kernel = [&](auto has_main_k_block_loop, auto tail_number) {
+                constexpr bool has_loop = decltype(has_main_k_block_loop)::value;
+                constexpr TailNumber tn = tail_number;
+
+                const auto kernel =
+                    kernel_batched_gemm_gemm_wmma_cshuffle_v3<DeviceOp, GridwiseOp, has_loop, tn>;
+
+                return launch_and_time_kernel(
+                    stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0, arg);
+            };
+
+            bool HasMainKBlockLoop = GridwiseOp::CalculateHasMainKBlockLoop(arg.K);
+            TailNumber TailNum     = GridwiseOp::CalculateKBlockLoopTailNum(arg.K);
+
+            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+            {
+                if(HasMainKBlockLoop && TailNum == TailNumber::Full)
+                {
+                    return launch_kernel(std::integral_constant<bool, true>{},
+                                         std::integral_constant<TailNumber, TailNumber::Full>{});
+                }
+                else if(!HasMainKBlockLoop && TailNum == TailNumber::Full)
+                {
+                    return launch_kernel(std::integral_constant<bool, false>{},
+                                         std::integral_constant<TailNumber, TailNumber::Full>{});
+                }
+                else
+                {
+                    printf("Invalid HasMainKBlockLoop and TailNum combination for V1!\n");
+                    return 0.0f;
+                }
+            }
+            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+            {
+                if(HasMainKBlockLoop && TailNum == TailNumber::Full)
+                {
+                    return launch_kernel(std::integral_constant<bool, true>{},
+                                         std::integral_constant<TailNumber, TailNumber::Full>{});
+                }
+                else if(!HasMainKBlockLoop && TailNum == TailNumber::Even)
+                {
+                    return launch_kernel(std::integral_constant<bool, false>{},
+                                         std::integral_constant<TailNumber, TailNumber::Even>{});
+                }
+                else if(!HasMainKBlockLoop && TailNum == TailNumber::Odd)
+                {
+                    return launch_kernel(std::integral_constant<bool, false>{},
+                                         std::integral_constant<TailNumber, TailNumber::Odd>{});
+                }
+                else
+                {
+                    printf("Invalid HasMainKBlockLoop and TailNum combination for V3!\n");
+                    return 0.0f;
+                }
+            }
+            else
+            {
+                printf("Invalid pipeline version!\n");
+                return 0.0f;
+            }
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b0,
+                                                      const void* p_b1,
+                                                      void* p_c,
+                                                      ck::index_t M,
+                                                      ck::index_t N,
+                                                      ck::index_t K,
+                                                      ck::index_t O,
+                                                      ck::index_t Batch,
+                                                      ck::index_t StrideA,
+                                                      ck::index_t StrideB0,
+                                                      ck::index_t StrideB1,
+                                                      ck::index_t StrideC,
+                                                      ck::index_t BatchStrideA,
+                                                      ck::index_t BatchStrideB0,
+                                                      ck::index_t BatchStrideB1,
+                                                      ck::index_t BatchStrideC,
+                                                      AElementwiseOperation a_element_op,
+                                                      B0ElementwiseOperation b0_element_op,
+                                                      AccElementwiseOperation acc_element_op,
+                                                      B1ElementwiseOperation b1_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<RawArg>(static_cast<const ADataType*>(p_a),
+                                        static_cast<const B0DataType*>(p_b0),
+                                        static_cast<const B1DataType*>(p_b1),
+                                        static_cast<CDataType*>(p_c),
+                                        M,
+                                        N,
+                                        K,
+                                        O,
+                                        Batch,
+                                        StrideA,
+                                        StrideB0,
+                                        StrideB1,
+                                        StrideC,
+                                        BatchStrideA,
+                                        BatchStrideB0,
+                                        BatchStrideB1,
+                                        BatchStrideC,
+                                        a_element_op,
+                                        b0_element_op,
+                                        acc_element_op,
+                                        b1_element_op,
+                                        c_element_op);
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    template <typename T>
+    static constexpr const char* DataTypeToString()
+    {
+        if constexpr(std::is_same_v<T, float>)
+        {
+            return "fp32";
+        }
+        else if constexpr(std::is_same_v<T, ck::half_t>)
+        {
+            return "fp16";
+        }
+        else if constexpr(std::is_same_v<T, ck::bhalf_t>)
+        {
+            return "bf16";
+        }
+        else if constexpr(std::is_same_v<T, ck::f8_t>)
+        {
+            return "fp8";
+        }
+        else if constexpr(std::is_same_v<T, ck::bf8_t>)
+        {
+            return "bf8";
+        }
+        else if constexpr(std::is_same_v<T, int32_t>)
+        {
+            return "int32";
+        }
+        else if constexpr(std::is_same_v<T, int8_t>)
+        {
+            return "int8";
+        }
+        else if constexpr(std::is_same_v<T, ck::int4_t>)
+        {
+            return "int4";
+        }
+        else
+        {
+            return "unknown";
+        }
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceBatchedGemmGemm_Wmma_CShuffleV3"
+            << "<"
+            << ALayout::name[0]
+            << B0layout::name[0]
+            << B1Layout::name[0]
+            << CLayout::name[0] << ", "
+            << "A " << DataTypeToString<ADataType>() << ", "
+            << "B0 " << DataTypeToString<B0DataType>() << ", "
+            << "B1 " << DataTypeToString<B1DataType>() << ", "
+            << "C " << DataTypeToString<CDataType>() << ", "
+            << "Acc " << DataTypeToString<AccDataType>() << ", "
+            << "Cshuf " << DataTypeToString<CShuffleDataType>() << ", "
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << LPerBlock << ", "
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1 << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << LTilePerBlock << ", "
+            << L1 << ", "
+            << getGemmSpecializationString(GemmSpec)
+            << ">"
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseOp::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
index b23d864f5c..0ddcd63b2e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -59,36 +59,40 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                      const index_t batch_count,
                                      const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
-    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
+        const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_b1_grid + b1_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  acc_element_op,
-                                                  b1_element_op,
-                                                  c_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  b1_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+                                                      p_b_grid + b_batch_offset,
+                                                      p_b1_grid + b1_batch_offset,
+                                                      p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      acc_element_op,
+                                                      b1_element_op,
+                                                      c_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      b1_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -185,6 +189,10 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
 {
     using DeviceOp = DeviceBatchedGemmGemm_Xdl_CShuffle;
 
+    static constexpr auto MXdlPerWave64 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+    static constexpr auto MXdlPerWave32 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -346,7 +354,8 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
     using CGridDesc_M_N        = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseBatchedGemmGemm_Xdl_CShuffle<
+    template <index_t MXdlPerWave_>
+    using GridwiseGemmBase = GridwiseBatchedGemmGemm_Xdl_CShuffle<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -373,7 +382,7 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
         B1K1,
         MPerXDL,
         NPerXDL,
-        MXdlPerWave,
+        MXdlPerWave_,
         NXdlPerWave,
         Gemm1NXdlPerWave,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -405,6 +414,8 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(MXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<MXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -440,8 +451,7 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
               b1_grid_desc_bk0_n_bk1_{
                   DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)},
               c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               acc_element_op_{acc_element_op},
@@ -451,16 +461,6 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
               compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC},
               raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           b1_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
         }
 
         void Print() const
@@ -480,9 +480,7 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         AccElementwiseOperation acc_element_op_;
@@ -500,13 +498,16 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!DeviceOp::IsSupportedArgument(arg))
             {
                 throw std::runtime_error("wrong! unsupported argument");
             }
-
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
 
@@ -551,7 +552,7 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
                                               arg.b1_grid_desc_bk0_n_bk1_,
-                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              c_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_ctile_map_,
                                               arg.batch_count_,
                                               arg.compute_base_ptr_of_batch_);
@@ -570,7 +571,24 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
 
             return ave_time;
         }
-
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config);
+                }
+            }
+            return 0;
+        }
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -587,11 +605,10 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // Note: we need raw lengths since threadwise copy can not handle vector load when part of
         // vector is out of bounds
         const auto MRaw      = arg.raw_lengths_m_n_k_o_[0];
@@ -617,11 +634,29 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.b1_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(MXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(MXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
index 1f8c6b1508..9aff562744 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -82,45 +82,48 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
                             const Block2ETileMap block_2_etile_map)
 {
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
-    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        DsPointer p_ds_grid_grp;
 
-    DsPointer p_ds_grid_grp;
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
 
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        p_a_grid + a_batch_offset,
-        p_b_grid + b_batch_offset,
-        p_ds_grid_grp,
-        p_e_grid + e_batch_offset,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_k0_m_k1,
-        b_grid_desc_k0_n_k1,
-        ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        e_grid_desc_mblock_mperblock_nblock_nperblock_,
-        block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_ds_grid_grp,
+            p_e_grid + e_batch_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_k0_m_k1,
+            b_grid_desc_k0_n_k1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock_,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -197,6 +200,10 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
 {
     using DeviceOp = DeviceBatchedGemmMultiD_Xdl;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -326,7 +333,8 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
 
     using ComputeDataType = ADataType;
 
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
         ComputeDataType,
@@ -347,7 +355,7 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -369,28 +377,48 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm>
+        void init_ds_e_grid_desc()
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
+                                           b_grid_desc_n_k_,
+                                           ds_grid_desc_m_n_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        ds_grid_desc_m_n_);
+
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+            }
+        }
         Argument(const void* p_a_grid,
                  const void* p_b_grid,
                  std::array<const void*, NumDTensor> p_ds_grid,
@@ -420,13 +448,13 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
               ds_grid_desc_m_n_{},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N<ELayout>(MRaw, NRaw, StrideE)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
               compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
@@ -445,19 +473,19 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
             });
 
             // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
+            if(get_warp_size() == 64)
             {
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
-
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm32>();
+                }
             }
         }
 
@@ -474,7 +502,7 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // Batch
@@ -510,7 +538,8 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
     {
         using Argument = DeviceBatchedGemmMultiD_Xdl::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -577,6 +606,8 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -587,16 +618,33 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index ea5668d765..bb4af1a8df 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -33,7 +33,7 @@ template <typename GridwiseGemm,
           typename CDE1ElementwiseOperation,
           typename A0GridDesc_AK0_M_AK1,
           typename B0GridDesc_BK0_N_BK1,
-          typename D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
+          typename D0sGridDesc_M_N,
           typename B1GridDesc_BK0_N_BK1,
           typename D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
           typename E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -58,8 +58,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const CDE1ElementwiseOperation cde1_element_op,
         const A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1,
         const B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1,
-        const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+        const D0sGridDesc_M_N d0s_griddesc_m_n,
         const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
         const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             d1s_grid_desc_mblock_mperblock_nblock_nperblock,
@@ -69,52 +68,57 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const index_t batch_count,
         const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
-    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
+        const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
 
-    static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) {
-        const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
-        p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset;
-    });
+        static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) {
+            const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
+            p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset;
+        });
 
-    static_for<0, p_d1s_grid.Size(), 1>{}([&](auto In) {
-        const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_base_ptr_of_batch.GetD1BasePtr(g_idx, In)));
-        p_d1s_grid(In) = p_d1s_grid(In) + d1_batch_offset;
-    });
+        static_for<0, p_d1s_grid.Size(), 1>{}([&](auto In) {
+            const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_base_ptr_of_batch.GetD1BasePtr(g_idx, In)));
+            p_d1s_grid(In) = p_d1s_grid(In) + d1_batch_offset;
+        });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a0_grid + a_batch_offset,
-                                                  p_b0_grid + b_batch_offset,
-                                                  p_d0s_grid,
-                                                  p_b1_grid + b1_batch_offset,
-                                                  p_d1s_grid,
-                                                  p_e1_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a0_element_op,
-                                                  b0_element_op,
-                                                  cde0_element_op,
-                                                  b1_element_op,
-                                                  cde1_element_op,
-                                                  a0_grid_desc_ak0_m_ak1,
-                                                  b0_grid_desc_bk0_n_bk1,
-                                                  d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                  b1_grid_desc_bk0_n_bk1,
-                                                  d1s_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e1_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_e1tile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a0_grid + a_batch_offset,
+            p_b0_grid + b_batch_offset,
+            p_d0s_grid,
+            p_b1_grid + b1_batch_offset,
+            p_d1s_grid,
+            p_e1_grid + c_batch_offset,
+            p_shared,
+            a0_element_op,
+            b0_element_op,
+            cde0_element_op,
+            b1_element_op,
+            cde1_element_op,
+            a0_grid_desc_ak0_m_ak1,
+            b0_grid_desc_bk0_n_bk1,
+            d0s_griddesc_m_n,
+            b1_grid_desc_bk0_n_bk1,
+            d1s_grid_desc_mblock_mperblock_nblock_nperblock,
+            e1_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_e1tile_map);
+    }
 #else
     ignore = p_a0_grid;
     ignore = p_b0_grid;
@@ -129,7 +133,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
     ignore = cde1_element_op;
     ignore = a0_grid_desc_ak0_m_ak1;
     ignore = b0_grid_desc_bk0_n_bk1;
-    ignore = d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5;
+    ignore = d0s_griddesc_m_n;
     ignore = b1_grid_desc_bk0_n_bk1;
     ignore = d1s_grid_desc_mblock_mperblock_nblock_nperblock;
     ignore = e1_grid_desc_mblock_mperblock_nblock_nperblock;
@@ -231,6 +235,21 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 {
     using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle;
 
+    static constexpr auto Gemm0MXdlPerWave64 = GetNXdlPerWave2<BlockSize,
+                                                               Gemm0NPerBlock,
+                                                               Gemm0MPerBlock,
+                                                               Gemm0NPerXdl,
+                                                               Gemm0MPerXdl,
+                                                               Gemm0NXdlPerWave,
+                                                               true>();
+    static constexpr auto Gemm0MXdlPerWave32 = GetNXdlPerWave2<BlockSize,
+                                                               Gemm0NPerBlock,
+                                                               Gemm0MPerBlock,
+                                                               Gemm0NPerXdl,
+                                                               Gemm0MPerXdl,
+                                                               Gemm0NXdlPerWave,
+                                                               false>();
+
     static constexpr index_t NumD0Tensor = D0sDataType::Size();
     static constexpr index_t NumD1Tensor = D1sDataType::Size();
 
@@ -443,7 +462,8 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
     using E1GridDesc_M_N  = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle<
+    template <index_t Gemm0MXdlPerWave_>
+    using GridwiseGemmBase = GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle<
         A0DataType, // TODO: distinguish A/B datatype
         Acc0DataType,
         D0sDataType,
@@ -475,7 +495,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         B1K1,
         Gemm0MPerXdl,
         Gemm0NPerXdl,
-        Gemm0MXdlPerWave,
+        Gemm0MXdlPerWave_,
         Gemm0NXdlPerWave,
         Gemm1NXdlPerWave,
         A0BlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -509,15 +529,17 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(Gemm0MXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<Gemm0MXdlPerWave32>;
 
     using A0GridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultA0GridDescriptor_AK0_M_AK1(
             A0GridDesc_M_K{}))>;
     using B0GridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultB0GridDescriptor_BK0_N_BK1(
             B0GridDesc_N_K{}))>;
     using B1GridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultB1GridDescriptor_BK0_N_BK1(
             B1GridDesc_N_K{}))>;
 
     // Argument
@@ -565,15 +587,12 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
               e1_grid_desc_m_n_{
                   DeviceOp::MakeE1GridDescriptor_M_N<E1Layout>(MRaw, Gemm1NRaw, StrideE1)},
               a0_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(a0_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultA0GridDescriptor_AK0_M_AK1(a0_grid_desc_m_k_)},
               b0_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(b0_grid_desc_n_k_)},
-              d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{},
+                  GridwiseGemm64::MakeDefaultB0GridDescriptor_BK0_N_BK1(b0_grid_desc_n_k_)},
               b1_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(b1_grid_desc_n_k_)},
-              d1s_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e1_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_e1tile_map_{GridwiseGemm::MakeDefaultBlock2E1TileMap(e1_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultB1GridDescriptor_BK0_N_BK1(b1_grid_desc_n_k_)},
+              block_2_e1tile_map_{GridwiseGemm64::MakeDefaultBlock2E1TileMap(e1_grid_desc_m_n_)},
               a0_element_op_{a0_element_op},
               b0_element_op_{b0_element_op},
               cde0_element_op_{cde0_element_op},
@@ -597,18 +616,6 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                           << ", " << d0s_grid_desc_m_n_[I0].GetLength(I1) << "}" << std::endl;
                 std::cout << "b1_grid_desc_n_k_{" << b1_grid_desc_n_k_.GetLength(I0) << ", "
                           << b1_grid_desc_n_k_.GetLength(I1) << "}" << std::endl;
-                std::cout << "d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{"
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I0) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I1) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I2) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I3) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I4) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I5) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I6) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I7) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I8) << ", "
-                          << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I9) << "}"
-                          << std::endl;
                 std::cout << "e1_grid_desc_m_n_{" << e1_grid_desc_m_n_.GetLength(I0) << ", "
                           << e1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
             }
@@ -636,34 +643,15 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                 d1s_grid_desc_m_n_(i) =
                     DeviceOp::MakeE1GridDescriptor_M_N<D1Layout>(MRaw, Gemm1NRaw, StrideD1s[i]);
             });
-
-            if(GridwiseGemm::CheckValidity(a0_grid_desc_m_k_,
-                                           b0_grid_desc_n_k_,
-                                           b1_grid_desc_n_k_,
-                                           e1_grid_desc_m_n_,
-                                           block_2_e1tile_map_))
-            {
-                e1_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e1_grid_desc_m_n_);
-
-                d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_ =
-                    GridwiseGemm::MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(
-                        d0s_grid_desc_m_n_);
-
-                d1s_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        d1s_grid_desc_m_n_);
-            }
         }
 
         //  private:
         // pointers
         const A0DataType* p_a0_grid_;
         const B0DataType* p_b0_grid_;
-        typename GridwiseGemm::D0sGridPointer p_d0s_grid_;
+        typename GridwiseGemm64::D0sGridPointer p_d0s_grid_;
         const B1DataType* p_b1_grid_;
-        typename GridwiseGemm::D1sGridPointer p_d1s_grid_;
+        typename GridwiseGemm64::D1sGridPointer p_d1s_grid_;
         E1DataType* p_e1_grid_;
 
         // tensor descriptors for problem definiton
@@ -677,16 +665,10 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1_;
         B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            d1s_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e1_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         // block-to-e1-tile map
-        typename GridwiseGemm::DefaultBlock2E1TileMap block_2_e1tile_map_;
+        typename GridwiseGemm64::DefaultBlock2E1TileMap block_2_e1tile_map_;
 
         // element-wise op
         A0ElementwiseOperation a0_element_op_;
@@ -705,7 +687,8 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_,
                                             arg.b0_grid_desc_n_k_,
@@ -716,6 +699,14 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
+            auto e1_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e1_grid_desc_m_n_);
+
+            auto d1s_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.d1s_grid_desc_m_n_);
+
             const index_t grid_size =
                 arg.block_2_e1tile_map_.CalculateGridSize(arg.e1_grid_desc_m_n_) * arg.batch_count_;
 
@@ -736,7 +727,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                     CDE1ElementwiseOperation,
                     DeviceOp::A0GridDesc_AK0_M_AK1,
                     DeviceOp::B0GridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
+                    DeviceOp::D0sGridDesc_M_N,
                     DeviceOp::B1GridDesc_BK0_N_BK1,
                     typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                     typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -762,10 +753,10 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                                               arg.cde1_element_op_,
                                               arg.a0_grid_desc_ak0_m_ak1_,
                                               arg.b0_grid_desc_bk0_n_bk1_,
-                                              arg.d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_,
+                                              arg.d0s_grid_desc_m_n_,
                                               arg.b1_grid_desc_bk0_n_bk1_,
-                                              arg.d1s_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e1_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              d1s_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e1_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_e1tile_map_,
                                               arg.batch_count_,
                                               arg.compute_base_ptr_of_batch_);
@@ -783,6 +774,25 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
             }
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(Gemm0MXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(Gemm0MXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -813,7 +823,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<A0DataType, B0DataType, Gemm0MPerXdl, Gemm0NPerXdl>())
         {
             return false;
         }
@@ -836,11 +846,29 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_,
-                                           arg.b0_grid_desc_n_k_,
-                                           arg.b1_grid_desc_n_k_,
-                                           arg.e1_grid_desc_m_n_,
-                                           arg.block_2_e1tile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(Gemm0MXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a0_grid_desc_m_k_,
+                                                     arg.b0_grid_desc_n_k_,
+                                                     arg.b1_grid_desc_n_k_,
+                                                     arg.e1_grid_desc_m_n_,
+                                                     arg.block_2_e1tile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(Gemm0MXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a0_grid_desc_m_k_,
+                                                     arg.b0_grid_desc_n_k_,
+                                                     arg.b1_grid_desc_n_k_,
+                                                     arg.e1_grid_desc_m_n_,
+                                                     arg.block_2_e1tile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
index 64d5fbd509..7c8c93cd91 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -37,35 +37,38 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_batched_gemm_xdl_cshuffle_v3_multi_d(BatchedGemmArg karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t g_idx = blockIdx.z % karg.Batch;
-    const index_t k_idx = blockIdx.z / karg.Batch;
+        const index_t g_idx = blockIdx.z % karg.Batch;
+        const index_t k_idx = blockIdx.z / karg.Batch;
 
-    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
-    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
-    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
-    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+        const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
 
-    // populate pointer, desc for Ds
-    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
-        // D pointer
-        karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
-    });
+        // populate pointer, desc for Ds
+        static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+            // D pointer
+            karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
+        });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid + c_batch_offset,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid + c_batch_offset,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -83,39 +86,42 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds(BatchedGemmArg karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t g_idx = blockIdx.z % karg.Batch;
-    const index_t k_idx = blockIdx.z / karg.Batch;
+        const index_t g_idx = blockIdx.z % karg.Batch;
+        const index_t k_idx = blockIdx.z / karg.Batch;
 
-    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
-    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
-    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
-    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+        const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
 
-    // populate pointer, desc for Ds
-    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
-        // D pointer
-        karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
-    });
+        // populate pointer, desc for Ds
+        static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+            // D pointer
+            karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
+        });
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid + c_batch_offset,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid + c_batch_offset,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -185,12 +191,17 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                        BElementwiseOperation,
                                        CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor            = DsDataType::Size();
     using CDEShuffleBlockTransferScalarPerVectors_ = CDEShuffleBlockTransferScalarPerVectors;
     using CDataType_                               = CDataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         DsLayout,
@@ -214,7 +225,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -241,6 +252,9 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
+    using GridwiseGemm   = GridwiseGemm64;
 
     struct ComputePtrOffsetOfStridedBatch
     {
@@ -270,7 +284,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         {
             std::array<long_index_t, NumDTensor> ds_offset_;
 
-            static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
                 ds_offset_[i] = static_cast<long_index_t>(BatchStrideDs_[i]) * g_idx;
             });
 
@@ -289,32 +303,33 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         index_t BatchStrideC_;
     };
 
-    struct Argument : public GridwiseGemm::Argument
+    template <typename GridwiseGemm>
+    struct ArgumentBase : public GridwiseGemm::Argument
     {
         index_t Batch;
         ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
 
-        Argument() = default;
-        Argument(const ADataType* p_a_grid_,
-                 const BDataType* p_b_grid_,
-                 std::array<const void*, NumDTensor> p_ds_grid_,
-                 CDataType* p_e_grid_,
-                 index_t M_,
-                 index_t N_,
-                 index_t K_,
-                 index_t StrideA_,
-                 index_t StrideB_,
-                 std::array<index_t, NumDTensor> StrideDs_,
-                 index_t StrideE_,
-                 index_t BatchStrideA_,
-                 index_t BatchStrideB_,
-                 const std::array<ck::index_t, NumDTensor>& BatchStrideDs_,
-                 index_t BatchStrideE_,
-                 index_t Batch_,
-                 AElementwiseOperation a_element_op_,
-                 BElementwiseOperation b_element_op_,
-                 CElementwiseOperation c_element_op_,
-                 index_t KBatch_)
+        ArgumentBase() = default;
+        ArgumentBase(const ADataType* p_a_grid_,
+                     const BDataType* p_b_grid_,
+                     std::array<const void*, NumDTensor> p_ds_grid_,
+                     CDataType* p_e_grid_,
+                     index_t M_,
+                     index_t N_,
+                     index_t K_,
+                     index_t StrideA_,
+                     index_t StrideB_,
+                     std::array<index_t, NumDTensor> StrideDs_,
+                     index_t StrideE_,
+                     index_t BatchStrideA_,
+                     index_t BatchStrideB_,
+                     const std::array<ck::index_t, NumDTensor>& BatchStrideDs_,
+                     index_t BatchStrideE_,
+                     index_t Batch_,
+                     AElementwiseOperation a_element_op_,
+                     BElementwiseOperation b_element_op_,
+                     CElementwiseOperation c_element_op_,
+                     index_t KBatch_)
             : GridwiseGemm::Argument{p_a_grid_,
                                      p_b_grid_,
                                      p_ds_grid_,
@@ -336,6 +351,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         {
         }
     };
+    using Argument = ArgumentBase<GridwiseGemm64>;
 
     struct ActiveWorkgroupsPerCU
     {
@@ -359,31 +375,69 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                 }
             }();
 
-            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+            if(get_warp_size() == 64)
             {
-                hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-                    &max_occupancy,
-                    kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
-                        GridwiseGemm,
-                        Argument,
-                        true,
-                        InMemoryDataOperationEnum::AtomicAdd,
-                        minimum_occupancy>,
-                    BlockSize,
-                    dynamic_smem_size));
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                    {
+                        hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                            &max_occupancy,
+                            kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm64,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy>,
+                            BlockSize,
+                            dynamic_smem_size));
+                    }
+                    else
+                    {
+                        hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                            &max_occupancy,
+                            kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm64,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy>,
+                            BlockSize,
+                            dynamic_smem_size));
+                    }
+                }
             }
             else
             {
-                hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-                    &max_occupancy,
-                    kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
-                        GridwiseGemm,
-                        Argument,
-                        true,
-                        InMemoryDataOperationEnum::AtomicAdd,
-                        minimum_occupancy>,
-                    BlockSize,
-                    dynamic_smem_size));
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                    {
+                        hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                            &max_occupancy,
+                            kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm32,
+                                ArgumentBase<GridwiseGemm32>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy>,
+                            BlockSize,
+                            dynamic_smem_size));
+                    }
+                    else
+                    {
+                        hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                            &max_occupancy,
+                            kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm32,
+                                ArgumentBase<GridwiseGemm32>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy>,
+                            BlockSize,
+                            dynamic_smem_size));
+                    }
+                }
             }
 
             max_occupancy_ = std::max(1, max_occupancy);
@@ -394,14 +448,16 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            using BatchGemmArgument = ArgumentBase<GridwiseGemm>;
             if(stream_config.log_level_ > 0)
             {
                 arg.Print();
             }
 
-            if(!GridwiseGemm::CheckValidity(arg))
+            if(!GridwiseGemm::CheckValidity(reinterpret_cast<const BatchGemmArgument&>(arg)))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
@@ -423,7 +479,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    BatchGemmArgument arg_ = reinterpret_cast<const BatchGemmArgument&>(arg);
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -442,8 +498,12 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<BatchGemmArgument, DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -480,13 +540,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                                stream_config.stream_id_));
                     };
 
-                    ave_time = launch_and_time_kernel_with_preprocess(stream_config,
+                    BatchGemmArgument arg_ = reinterpret_cast<const BatchGemmArgument&>(arg);
+                    ave_time               = launch_and_time_kernel_with_preprocess(stream_config,
                                                                       clear_workspace,
                                                                       kernel,
                                                                       dim3(gdx, gdy, gdz),
                                                                       dim3(BlockSize),
                                                                       0,
-                                                                      arg);
+                                                                      arg_);
                 }
             };
 
@@ -515,7 +576,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                     {
                         const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                             GridwiseGemm,
-                            Argument,
+                            BatchGemmArgument,
                             true,
                             InMemoryDataOperationEnum::AtomicAdd,
                             minimum_occupancy>;
@@ -525,7 +586,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                     {
                         const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                             GridwiseGemm,
-                            Argument,
+                            BatchGemmArgument,
                             true,
                             InMemoryDataOperationEnum::Set,
                             minimum_occupancy>;
@@ -541,7 +602,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -553,7 +614,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -567,7 +628,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -583,7 +644,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -599,7 +660,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -615,7 +676,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -630,7 +691,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -646,7 +707,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -661,7 +722,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -673,7 +734,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -687,7 +748,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -703,7 +764,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -719,7 +780,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -735,7 +796,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -750,7 +811,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -766,7 +827,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             {
                                 const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                     GridwiseGemm,
-                                    Argument,
+                                    BatchGemmArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -785,7 +846,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -796,7 +857,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -810,7 +871,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -821,7 +882,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -838,7 +899,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -849,7 +910,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -863,7 +924,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -874,7 +935,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         {
                             const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                                 GridwiseGemm,
-                                Argument,
+                                BatchGemmArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -893,7 +954,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                     {
                         const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                             GridwiseGemm,
-                            Argument,
+                            BatchGemmArgument,
                             false,
                             InMemoryDataOperationEnum::AtomicAdd,
                             minimum_occupancy>;
@@ -903,7 +964,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                     {
                         const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
                             GridwiseGemm,
-                            Argument,
+                            BatchGemmArgument,
                             false,
                             InMemoryDataOperationEnum::Set,
                             minimum_occupancy>;
@@ -915,6 +976,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -931,11 +994,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -948,8 +1014,22 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -1093,7 +1173,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index ffebad253b..d2b77a5901 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -60,41 +60,46 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
         const Block2CTileMap block_2_ctile_map)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
 
-    static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) {
-        const long_index_t d_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_base_ptr_of_batch_.GetDBasePtr(g_idx, In)));
-        p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset;
-    });
+        static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) {
+            const long_index_t d_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_base_ptr_of_batch_.GetDBasePtr(g_idx, In)));
+            p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset;
+        });
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid + a_batch_offset,
-                                                   p_b_grid + b_batch_offset,
-                                                   p_c_grid + c_batch_offset,
-                                                   p_reduces_grid,
-                                                   p_shared,
-                                                   a_element_op,
-                                                   b_element_op,
-                                                   c_element_op,
-                                                   reduce_in_element_ops,
-                                                   reduce_out_element_ops,
-                                                   a_grid_desc_ak0_m_ak1,
-                                                   b_grid_desc_bk0_n_bk1,
-                                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                   reduce_grid_desc_mblock_mperblock,
-                                                   block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainK0BlockLoop>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_c_grid + c_batch_offset,
+            p_reduces_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            reduce_in_element_ops,
+            reduce_out_element_ops,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+            reduce_grid_desc_mblock_mperblock,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -172,6 +177,9 @@ template <typename ALayout,
 struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperations::Size()>
 {
     using DeviceOp = DeviceBatchedGemmReduce_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -517,7 +525,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
     };
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -546,7 +555,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -571,6 +580,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
         CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
         CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -600,32 +611,18 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
               b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
               c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
               reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              reduce_grid_desc_mblock_mperblock_{},
               compute_base_ptr_of_batch_{
                   type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
                   type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
                   type_convert<index_t>(c_grid_desc_m_n_.GetElementSpaceSize()),
                   type_convert<index_t>(reduce_grid_desc_m_.GetElementSpaceSize())},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               c_element_op_{c_element_op},
               reduce_in_element_ops_{reduce_in_element_ops},
               reduce_out_element_ops_{reduce_out_element_ops}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-
-                reduce_grid_desc_mblock_mperblock_ =
-                    GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_);
-            }
         }
 
         //  private:
@@ -638,12 +635,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         CGridDesc_M_N c_grid_desc_m_n_;
         ReduceGridDesc_M reduce_grid_desc_m_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock
-            reduce_grid_desc_mblock_mperblock_;
         ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CElementwiseOperation c_element_op_;
@@ -656,8 +649,24 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                            arg.b_grid_desc_bk0_n_bk1_,
+                                            arg.c_grid_desc_m_n_,
+                                            arg.block_2_ctile_map_))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
+
+            auto reduce_grid_desc_mblock_mperblock =
+                GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(arg.reduce_grid_desc_m_);
+
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
                 {
@@ -680,15 +689,6 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
                               << arg.reduce_grid_desc_m_.GetLength(I0) << "}" << std::endl;
                 }
             }
-
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
-            {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
-            }
-
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
 
@@ -716,28 +716,27 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                elapsed_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_reduces_grid_,
-                                           arg.Batch_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.reduce_in_element_ops_,
-                                           arg.reduce_out_element_ops_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
-                                           arg.compute_base_ptr_of_batch_,
-                                           arg.block_2_ctile_map_);
+                elapsed_time = launch_and_time_kernel(stream_config,
+                                                      kernel,
+                                                      dim3(grid_size),
+                                                      dim3(BlockSize),
+                                                      0,
+                                                      arg.p_a_grid_,
+                                                      arg.p_b_grid_,
+                                                      arg.p_c_grid_,
+                                                      arg.p_reduces_grid_,
+                                                      arg.Batch_,
+                                                      arg.a_element_op_,
+                                                      arg.b_element_op_,
+                                                      arg.c_element_op_,
+                                                      arg.reduce_in_element_ops_,
+                                                      arg.reduce_out_element_ops_,
+                                                      arg.a_grid_desc_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bk0_n_bk1_,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      reduce_grid_desc_mblock_mperblock,
+                                                      arg.compute_base_ptr_of_batch_,
+                                                      arg.block_2_ctile_map_);
             }
             else
             {
@@ -759,33 +758,34 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
 
-                elapsed_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_reduces_grid_,
-                                           arg.Batch_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.reduce_in_element_ops_,
-                                           arg.reduce_out_element_ops_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
-                                           arg.compute_base_ptr_of_batch_,
-                                           arg.block_2_ctile_map_);
+                elapsed_time = launch_and_time_kernel(stream_config,
+                                                      kernel,
+                                                      dim3(grid_size),
+                                                      dim3(BlockSize),
+                                                      0,
+                                                      arg.p_a_grid_,
+                                                      arg.p_b_grid_,
+                                                      arg.p_c_grid_,
+                                                      arg.p_reduces_grid_,
+                                                      arg.Batch_,
+                                                      arg.a_element_op_,
+                                                      arg.b_element_op_,
+                                                      arg.c_element_op_,
+                                                      arg.reduce_in_element_ops_,
+                                                      arg.reduce_out_element_ops_,
+                                                      arg.a_grid_desc_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bk0_n_bk1_,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      reduce_grid_desc_mblock_mperblock,
+                                                      arg.compute_base_ptr_of_batch_,
+                                                      arg.block_2_ctile_map_);
             }
 
             return elapsed_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -802,15 +802,31 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index d835bb6c61..745cf2b722 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -36,7 +36,7 @@ template <typename GridwiseGemm,
           typename BGridDesc_BK0_N_BK1,
           typename B1GridDesc_BK0_N_BK1,
           typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
+          typename D0sGridDesc_M_N,
           typename Block2CTileMap,
           typename ComputeBasePtrOfStridedBatch,
           typename C0MatrixMask,
@@ -61,52 +61,56 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
         const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             c1_grid_desc_mblock_mperblock_nblock_nperblock,
-        const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+        const D0sGridDesc_M_N d0s_griddesc_m_n,
         const Block2CTileMap block_2_ctile_map,
         const index_t batch_count,
         const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
         const C0MatrixMask c0_matrix_mask)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
-    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
+        const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
 
-    static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) {
-        const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
-        p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset;
-    });
+        static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) {
+            const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
+            p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset;
+        });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_b1_grid + b1_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_d0s_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c0de_element_op,
-                                                  b1_element_op,
-                                                  c1de_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  b1_grid_desc_bk0_n_bk1,
-                                                  c1_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                  block_2_ctile_map,
-                                                  c0_matrix_mask);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_b1_grid + b1_batch_offset,
+            p_c_grid + c_batch_offset,
+            p_d0s_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            c0de_element_op,
+            b1_element_op,
+            c1de_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            b1_grid_desc_bk0_n_bk1,
+            c1_grid_desc_mblock_mperblock_nblock_nperblock,
+            d0s_griddesc_m_n,
+            block_2_ctile_map,
+            c0_matrix_mask);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -122,7 +126,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
     ignore = b_grid_desc_bk0_n_bk1;
     ignore = b1_grid_desc_bk0_n_bk1;
     ignore = c1_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5;
+    ignore = d0s_griddesc_m_n;
     ignore = block_2_ctile_map;
     ignore = batch_count;
     ignore = compute_base_ptr_of_batch;
@@ -218,6 +222,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                                                  C1DEElementwiseOperation,
                                                  MaskingSpec>
 {
+    static constexpr auto MXdlPerWave64 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+    static constexpr auto MXdlPerWave32 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
+
     static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0,
                   "Number of dimension must be greater than 0");
 
@@ -377,7 +386,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
     };
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle<
+    template <index_t MXdlPerWave_>
+    using GridwiseGemmBase = GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -406,7 +416,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         B1K1,
         MPerXDL,
         NPerXDL,
-        MXdlPerWave,
+        MXdlPerWave_,
         NXdlPerWave,
         Gemm1NXdlPerWave,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -441,6 +451,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         Transform::matrix_padder.PadN,
         MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle,
         D0sTransferSrcScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(MXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<MXdlPerWave32>;
 
     // Argument
     // FIXME: constness
@@ -485,6 +497,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                   b1_gs_gemm1ns_gemm1ks_lengths, b1_gs_gemm1ns_gemm1ks_strides)},
               c1_grid_desc_m_n_{Transform::MakeCGridDescriptor_M_N(c_gs_ms_gemm1ns_lengths,
                                                                    c_gs_ms_gemm1ns_strides)},
+              d0s_grid_desc_m_n_{DeviceOp::MakeD0sGridDescriptor_M_N(acc0_biases_gs_ms_ns_lengths,
+                                                                     acc0_biases_gs_ms_ns_strides)},
               a_grid_desc_g_m_k_{
                   Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)},
               b_grid_desc_g_n_k_{
@@ -495,9 +509,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                                                                        c_gs_ms_gemm1ns_strides)},
               d0s_grid_desc_g_m_n_{DeviceOp::MakeD0sGridDescriptor_G_M_N(
                   acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides)},
-              c1_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c1_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c1_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               c0de_element_op_{c0de_element_op},
@@ -538,23 +550,6 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 d0s_nl_ns_lengths_strides_[i].push_back(
                     acc0_biases_gs_ms_ns_strides[i][NumDimG + NumDimM]);
             });
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           b1_grid_desc_bk0_n_bk1_,
-                                           c1_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c1_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeC1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c1_grid_desc_m_n_);
-
-                D0sGridDesc_M_N d0s_grid_desc_m_n{DeviceOp::MakeD0sGridDescriptor_M_N(
-                    acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides)};
-                d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_ =
-                    GridwiseGemm::MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(
-                        d0s_grid_desc_m_n);
-            }
         }
 
         void Print() const
@@ -578,26 +573,22 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         const BDataType* p_b_grid_;
         const B1DataType* p_b1_grid_;
         CDataType* p_c_grid_;
-        typename GridwiseGemm::D0sGridPointer p_d0s_grid_;
+        typename GridwiseGemm64::D0sGridPointer p_d0s_grid_;
 
         // tensor descriptor
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
         C1GridDesc_M_N c1_grid_desc_m_n_;
+        D0sGridDesc_M_N d0s_grid_desc_m_n_;
         AGridDesc_G_M_K a_grid_desc_g_m_k_;
         BGridDesc_G_N_K b_grid_desc_g_n_k_;
         B1GridDesc_G_N_K b1_grid_desc_g_n_k_;
         C1GridDesc_G_M_N c1_grid_desc_g_m_n_;
         D0sGridDesc_G_M_N d0s_grid_desc_g_m_n_;
 
-        typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c1_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
-            d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_;
-
         // block-to-c-tile map
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
 
         // element-wise op
         AElementwiseOperation a_element_op_;
@@ -626,12 +617,16 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!DeviceOp::IsSupportedArgument(arg))
             {
                 throw std::runtime_error("wrong! unsupported argument");
             }
+            auto c1_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeC1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c1_grid_desc_m_n_);
 
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c1_grid_desc_m_n_) * arg.batch_count_;
@@ -657,7 +652,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                     DeviceOp::BGridDesc_BK0_N_BK1,
                     DeviceOp::B1GridDesc_BK0_N_BK1,
                     typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5,
+                    D0sGridDesc_M_N,
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     ComputeBasePtrOfStridedBatch,
                     C0MatrixMask,
@@ -681,8 +676,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
                                               arg.b1_grid_desc_bk0_n_bk1_,
-                                              arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_,
+                                              c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              arg.d0s_grid_desc_m_n_,
                                               arg.block_2_ctile_map_,
                                               arg.batch_count_,
                                               arg.compute_base_ptr_of_batch_,
@@ -703,6 +698,25 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
             return ave_time;
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -724,11 +738,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
             arg.Print();
         }
 
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // TODO ANT: Check if tensor specialization & strides mismatch
 
         // Check if C permute dimension matches GEMM + GEMM shape
@@ -792,12 +805,29 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 return false;
             }
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.b1_grid_desc_bk0_n_bk1_,
-                                           arg.c1_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(MXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c1_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(MXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c1_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index 1345d2b782..77379c8fb1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -1,16 +1,18 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
 #ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#endif
+
+#include "ck/utility/common_header.hpp"
+#ifndef __HIPCC_RTC__
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #endif
 
-#include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -64,37 +66,41 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
         const C0MatrixMask c0_matrix_mask)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
-    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
+        const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_b1_grid + b1_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  acc_element_op,
-                                                  b1_element_op,
-                                                  c_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  b1_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_ctile_map,
-                                                  c0_matrix_mask);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+                                                      p_b_grid + b_batch_offset,
+                                                      p_b1_grid + b1_batch_offset,
+                                                      p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      acc_element_op,
+                                                      b1_element_op,
+                                                      c_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      b1_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      block_2_ctile_map,
+                                                      c0_matrix_mask);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -202,7 +208,12 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                                           CElementwiseOperation,
                                           MaskOutUpperTriangle>
 {
+
     using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle;
+    static constexpr auto MXdlPerWave64 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+    static constexpr auto MXdlPerWave32 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -369,7 +380,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                                        C0MatrixMask_impl<MaskDisabledPredicate>>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+    template <index_t MXdlPerWave_>
+    using GridwiseGemmBase = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -396,7 +408,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         B1K1,
         MPerXDL,
         NPerXDL,
-        MXdlPerWave,
+        MXdlPerWave_,
         NXdlPerWave,
         Gemm1NXdlPerWave,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -430,6 +442,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         LoopSched,
         matrix_padder.PadN,
         MaskOutUpperTriangle>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(MXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<MXdlPerWave32>;
 
 #ifndef __HIPCC_RTC__
     // Argument
@@ -466,8 +480,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
               b1_grid_desc_bk0_n_bk1_{
                   DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)},
               c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               acc_element_op_{acc_element_op},
@@ -478,16 +491,6 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
               c0_matrix_mask_{NRaw},
               raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           b1_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
         }
 
         //  private:
@@ -499,9 +502,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         AccElementwiseOperation acc_element_op_;
@@ -522,7 +523,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
                                             arg.b_grid_desc_bk0_n_bk1_,
@@ -532,7 +534,9 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
-
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
 
@@ -578,7 +582,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
                                               arg.b1_grid_desc_bk0_n_bk1_,
-                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              c_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_ctile_map_,
                                               arg.batch_count_,
                                               arg.compute_base_ptr_of_batch_,
@@ -599,6 +603,25 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
             return ave_time;
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -707,11 +730,10 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
 #ifndef __HIPCC_RTC__
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // Note: we need raw lengths since threadwise copy can not handle vector load when part of
         // vector is out of bounds
         const auto MRaw      = arg.raw_lengths_m_n_k_o_[0];
@@ -719,12 +741,31 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         const auto KRaw      = arg.raw_lengths_m_n_k_o_[2];
         const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3];
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.b1_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_) and
-               IsSupported(MRaw, NRaw, KRaw, Gemm1NRaw);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(MXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_) and
+                       IsSupported(MRaw, NRaw, KRaw, Gemm1NRaw);
+            }
+        }
+        else
+        {
+            if constexpr(MXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.b1_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_) and
+                       IsSupported(MRaw, NRaw, KRaw, Gemm1NRaw);
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -916,7 +957,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         using CGridDesc_M_N = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(CDesc{}))>;
 
         // GridwiseGemm
-        using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+        template <index_t MXdlPerWave_>
+        using GridwiseGemmBase = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
             ADataType, // TODO: distinguish A/B datatype
             GemmAccDataType,
             CShuffleDataType,
@@ -943,7 +985,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
             B1K1,
             MPerXDL,
             NPerXDL,
-            MXdlPerWave,
+            MXdlPerWave_,
             NXdlPerWave,
             Gemm1NXdlPerWave,
             ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -978,13 +1020,16 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
             matrix_padder.PadN,
             MaskOutUpperTriangle>;
 
+        using GridwiseGemm64 = GridwiseGemmBase<math::max(MXdlPerWave64, 1)>;
+        using GridwiseGemm32 = GridwiseGemmBase<MXdlPerWave32>;
+
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1;
         CGridDesc_M_N c_grid_desc_m_n;
         C0MatrixMask c0_matrix_mask;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map;
+        typename GridwiseGemm64::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             c_grid_descriptor_mblock_mperblock_nblock_nperblock;
 
         // element-wise op
@@ -1008,30 +1053,54 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
               b_grid_desc_bk0_n_bk1{MakeBGridDescriptor_BK0_N_BK1(b)},
               b1_grid_desc_bk0_n_bk1{MakeB1GridDescriptor_BK0_N_BK1(b1)},
               c_grid_desc_m_n{MakeCGridDescriptor_M_N(c)},
-              block_2_ctile_map{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n)},
+              block_2_ctile_map{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n)},
               c_grid_descriptor_mblock_mperblock_nblock_nperblock{
-                  GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                  GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                       c_grid_desc_m_n)},
-              has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
+              has_main_k_block_loop{GridwiseGemm64::CalculateHasMainKBlockLoop(
                   a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
               c0_matrix_mask{c.GetLength(I1)},
               a_element_op{a_element_op_},
               b_element_op{b_element_op_},
               b1_element_op{b1_element_op_},
               c_element_op{c_element_op_},
-              is_valid{GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1,
-                                                   b_grid_desc_bk0_n_bk1,
-                                                   b1_grid_desc_bk0_n_bk1,
-                                                   c_grid_desc_m_n,
-                                                   block_2_ctile_map) and
-                       IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
-                                   b_grid_desc_bk0_n_bk1.GetLength(I1),
-                                   a_grid_desc_ak0_m_ak1.GetLength(I0) *
-                                       a_grid_desc_ak0_m_ak1.GetLength(I2),
-                                   b1_grid_desc_bk0_n_bk1.GetLength(I1))}
+              is_valid{false}
         {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    is_valid = GridwiseGemm64::CheckValidity(a_grid_desc_ak0_m_ak1,
+                                                             b_grid_desc_bk0_n_bk1,
+                                                             b1_grid_desc_bk0_n_bk1,
+                                                             c_grid_desc_m_n,
+                                                             block_2_ctile_map) and
+                               IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
+                                           b_grid_desc_bk0_n_bk1.GetLength(I1),
+                                           a_grid_desc_ak0_m_ak1.GetLength(I0) *
+                                               a_grid_desc_ak0_m_ak1.GetLength(I2),
+                                           b1_grid_desc_bk0_n_bk1.GetLength(I1)) and
+                               GridwiseGemm64::template IsValidCompilationParameter<>();
+                }
+            }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    is_valid = GridwiseGemm32::CheckValidity(a_grid_desc_ak0_m_ak1,
+                                                             b_grid_desc_bk0_n_bk1,
+                                                             b1_grid_desc_bk0_n_bk1,
+                                                             c_grid_desc_m_n,
+                                                             block_2_ctile_map) and
+                               IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
+                                           b_grid_desc_bk0_n_bk1.GetLength(I1),
+                                           a_grid_desc_ak0_m_ak1.GetLength(I0) *
+                                               a_grid_desc_ak0_m_ak1.GetLength(I2),
+                                           b1_grid_desc_bk0_n_bk1.GetLength(I1)) and
+                               GridwiseGemm32::template IsValidCompilationParameter<>();
+                }
+            }
         }
-
         constexpr bool IsValid() const { return is_valid; }
     };
 
@@ -1061,12 +1130,15 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
 #ifndef __HIPCC_RTC__
         assert(desc.is_valid);
 #endif
-        __shared__ char p_shared_block[Desc::GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        using GridwiseGemm = conditional_t<get_warp_size() == 64,
+                                           typename Desc::GridwiseGemm64,
+                                           typename Desc::GridwiseGemm32>;
+        __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
         AccElementwiseOperation acc_element_op{scale};
 
         if(desc.has_main_k_block_loop)
         {
-            Desc::GridwiseGemm::template Run<true>(
+            GridwiseGemm::template Run<true>(
                 p_a_grid,
                 p_b_grid,
                 p_b1_grid,
@@ -1086,7 +1158,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         }
         else
         {
-            Desc::GridwiseGemm::template Run<false>(
+            GridwiseGemm::template Run<false>(
                 p_a_grid,
                 p_b_grid,
                 p_b1_grid,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp
index 5d983afb9b..e305dbfd9a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp
@@ -40,7 +40,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #if(defined(__gfx11__) || defined(__gfx12__))
 #if defined(__gfx11__)
     // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
-    using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_c_grid)>>;
+    using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_e_grid)>>;
     if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
                    (std::is_same_v<c_data_type, ck::half_t> ||
                     std::is_same_v<c_data_type, ck::bhalf_t>)))
@@ -62,14 +62,36 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 
         __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+
+        // shift A matrices pointer for splitk
+        typename GridwiseGemm::AsGridPointer p_as_grid_shift;
+        static_for<0, GridwiseGemm::NumATensor, 1>{}([&](auto i) {
+            using ADataType_ =
+                remove_cvref_t<tuple_element_t<i.value, typename GridwiseGemm::AsDataType_>>;
+            p_as_grid_shift(i) = static_cast<const ADataType_*>(karg.p_as_grid[i]) +
+                                 splitk_batch_offset.a_k_split_offset[i] + a_batch_offset;
+        });
+
+        // shift B matrices pointer for splitk
+        typename GridwiseGemm::BsGridPointer p_bs_grid_shift;
+        static_for<0, GridwiseGemm::NumBTensor, 1>{}([&](auto i) {
+            using BDataType_ =
+                remove_cvref_t<tuple_element_t<i.value, typename GridwiseGemm::BsDataType_>>;
+            p_bs_grid_shift(i) = static_cast<const BDataType_*>(karg.p_bs_grid[i]) +
+                                 splitk_batch_offset.b_k_split_offset[i] + b_batch_offset;
+        });
 
         GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-            karg.p_a_grid + splitk_batch_offset.a_k_split_offset + a_batch_offset,
-            karg.p_b_grid + splitk_batch_offset.b_k_split_offset + b_batch_offset,
-            karg.p_c_grid + splitk_batch_offset.c_reduce_offset + c_batch_offset,
+            p_as_grid_shift,
+            p_bs_grid_shift,
+            karg.p_ds_grid,
+            karg.p_e_grid + splitk_batch_offset.c_reduce_offset + c_batch_offset,
             p_shared,
-            karg);
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.cde_element_op);
 #if defined(__gfx11__)
     }
 #endif
@@ -272,11 +294,13 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
     using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
         ALayout,
         BLayout,
+        Tuple<>, // DsLayout
         CLayout,
-        ADataType,
-        BDataType,
+        Tuple<ADataType>,
+        Tuple<BDataType>,
         AccDataType,
         CShuffleDataType,
+        Tuple<>, // DsDataType
         CDataType,
         AElementwiseOperation,
         BElementwiseOperation,
@@ -311,7 +335,7 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
@@ -336,17 +360,25 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                           index_t BatchStrideC_,
                           index_t Batch_,
                           index_t k_batch_,
+                          AElementwiseOperation a_element_op_,
+                          BElementwiseOperation b_element_op_,
+                          CElementwiseOperation cde_element_op_,
                           bool is_reduce_ = false)
-            : GridwiseGemm::Argument(p_a_grid_,
-                                     p_b_grid_,
+            : GridwiseGemm::Argument(std::array<const void*, 1>{p_a_grid_},
+                                     std::array<const void*, 1>{p_b_grid_},
+                                     std::array<const void*, 0>{}, // p_ds_grid_
                                      p_c_grid_,
                                      M_,
                                      N_,
                                      K_,
-                                     StrideA_,
-                                     StrideB_,
+                                     std::array<index_t, 1>{StrideA_},
+                                     std::array<index_t, 1>{StrideB_},
+                                     std::array<index_t, 0>{}, // StrideDs_
                                      StrideC_,
                                      k_batch_,
+                                     a_element_op_,
+                                     b_element_op_,
+                                     cde_element_op_,
                                      is_reduce_),
               Batch(Batch_),
               compute_ptr_offset_of_batch{BatchStrideA_, BatchStrideB_, BatchStrideC_}
@@ -409,26 +441,33 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                 {
                     Argument arg_ = arg;
 
-                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
-                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
-                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
-                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideAs, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideBs, arg_.BK0);
 
                     // Packed sizes are 1 for all implemented data types but we include it anyway
                     // for future compatibility.
-                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
-                                         sizeof(ADataType) / GridwiseGemm::APackedSize;
-                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
-                                         sizeof(BDataType) / GridwiseGemm::BPackedSize;
-
                     // Note: the grid descriptors and size_a / size_b do *not* take batching into
                     // account, so we have to manually multiply overall buffer sizes for rotating
                     // memory by batch.
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
-                        arg_,
-                        stream_config.rotating_count,
-                        arg_.Batch * size_a_buffer,
-                        arg_.Batch * size_b_buffer);
+                    std::array<std::size_t, 1> size_as_buffers;
+                    size_as_buffers[0] = a_grid_desc_ak0_m_ak1[Number<0>{}].GetElementSpaceSize() *
+                                         sizeof(ADataType) / GridwiseGemm::APackedSize * arg_.Batch;
+
+                    std::array<std::size_t, 1> size_bs_buffers;
+                    size_bs_buffers[0] = b_grid_desc_bk0_n_bk1[Number<0>{}].GetElementSpaceSize() *
+                                         sizeof(BDataType) / GridwiseGemm::BPackedSize * arg_.Batch;
+
+                    ck::utility::RotatingMemWrapperMultiABD<Argument,
+                                                            Tuple<ADataType>,
+                                                            Tuple<BDataType>,
+                                                            Tuple<>>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_as_buffers,
+                                     size_bs_buffers,
+                                     std::array<std::size_t, 0>{});
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -443,7 +482,7 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                             // Note: This seems incorrect for non-contiguous memory layouts for C
                             // (padding, gaps).
                             HIP_CHECK_ERROR(
-                                hipMemsetAsync(arg_.p_c_grid,
+                                hipMemsetAsync(arg_.p_e_grid,
                                                0,
                                                arg_.Batch * arg_.M * arg_.N * sizeof(CDataType),
                                                stream_config.stream_id_));
@@ -469,7 +508,7 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                             // Note: This seems incorrect for non-contiguous memory layouts for C
                             // (padding, gaps).
                             HIP_CHECK_ERROR(
-                                hipMemsetAsync(arg.p_c_grid,
+                                hipMemsetAsync(arg.p_e_grid,
                                                0,
                                                arg.Batch * arg.M * arg.N * sizeof(CDataType),
                                                stream_config.stream_id_));
@@ -658,7 +697,10 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                         BatchStrideB,
                         BatchStrideC,
                         Batch,
-                        1 /* KBatch */};
+                        1, /* KBatch */
+                        AElementwiseOperation{},
+                        BElementwiseOperation{},
+                        CElementwiseOperation{}};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -694,7 +736,10 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
                                           BatchStrideB,
                                           BatchStrideC,
                                           Batch,
-                                          1); // KBatch
+                                          1,
+                                          AElementwiseOperation{},
+                                          BElementwiseOperation{},
+                                          CElementwiseOperation{}); // KBatch
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index d3f067f170..b8f03e742f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -52,36 +52,40 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
     kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / karg.Batch);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / karg.Batch);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const auto a_grid_desc_k0_m_k1 =
-        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
-            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
-    const auto b_grid_desc_k0_n_k1 =
-        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
-            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
-    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
-        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+        const auto a_grid_desc_k0_m_k1 =
+            amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+                karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+        const auto b_grid_desc_k0_n_k1 =
+            amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+                karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+        const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+            karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid + a_batch_offset,
-                                                  karg.p_b_grid + b_batch_offset,
-                                                  karg.p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m_n);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid + a_batch_offset,
+                                                      karg.p_b_grid + b_batch_offset,
+                                                      karg.p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_grid_desc_k0_m_k1,
+                                                      b_grid_desc_k0_n_k1,
+                                                      c_grid_desc_m_n);
+    }
 #else
     ignore = karg;
 #endif
@@ -135,6 +139,10 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                        BElementwiseOperation,
                                                        CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -172,7 +180,8 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     };
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -192,7 +201,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -215,8 +224,10 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
         NumGemmKPrefetchStage,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Problem = typename GridwiseGemm::Problem;
+    using Problem = typename GridwiseGemm64::Problem;
 
     // Argument
     struct Argument : public Problem, public BaseArgument
@@ -255,14 +266,17 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     {
         using Argument = DeviceBatchedGemmXdl::Argument;
 
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
                 karg.Print();
             }
 
-            if(!GridwiseGemm::CheckValidity(karg))
+            typename GridwiseGemm::Problem arg(
+                karg.M, karg.N, karg.K, karg.StrideA, karg.StrideB, karg.StrideC);
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext has invalid setting");
@@ -293,6 +307,8 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -309,12 +325,31 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
 
     static bool IsSupportedArgument(const Problem& problem)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(problem);
+        // temp disable on gfx11
+        if(ck::is_gfx11_supported())
+        {
+            return false;
+        }
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(problem);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Problem&>(problem));
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
index 459ebd7f35..d19810694b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
@@ -37,27 +37,30 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_batched_gemm_b_scale_xdl_cshuffle_v3(BatchedGemmArg karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t g_idx = blockIdx.z % karg.Batch;
-    const index_t k_idx = blockIdx.z / karg.Batch;
+        const index_t g_idx = blockIdx.z % karg.Batch;
+        const index_t k_idx = blockIdx.z / karg.Batch;
 
-    const auto a_batch_offset       = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
-    const auto b_batch_offset       = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
-    const auto c_batch_offset       = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
-    const auto b_scale_batch_offset = karg.compute_ptr_offset_of_batch.GetSacleBPtrOffset(g_idx);
+        const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+        const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        const auto b_scale_batch_offset =
+            karg.compute_ptr_offset_of_batch.GetSacleBPtrOffset(g_idx);
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
-        karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
-        p_shared,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
 
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
+            karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -75,31 +78,34 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds(BatchedGemmArg karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t g_idx = blockIdx.z % karg.Batch;
-    const index_t k_idx = blockIdx.z / karg.Batch;
+        const index_t g_idx = blockIdx.z % karg.Batch;
+        const index_t k_idx = blockIdx.z / karg.Batch;
 
-    const auto a_batch_offset       = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
-    const auto b_batch_offset       = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
-    const auto c_batch_offset       = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
-    const auto b_scale_batch_offset = karg.compute_ptr_offset_of_batch.GetSacleBPtrOffset(g_idx);
+        const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+        const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        const auto b_scale_batch_offset =
+            karg.compute_ptr_offset_of_batch.GetSacleBPtrOffset(g_idx);
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
-
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
-        karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
 
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + c_batch_offset + splitk_batch_offset.c_reduce_offset,
+            karg.p_b_scale_grid + b_scale_batch_offset + splitk_batch_offset.scale_k_split_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -171,8 +177,13 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                                        BElementwiseOperation,
                                        CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -196,7 +207,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -223,6 +234,8 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         ComputeTypeB,
         PermuteA,
         PermuteB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -277,31 +290,32 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         index_t BatchStrideScaleB_;
     };
 
-    struct Argument : public GridwiseGemm::Argument
+    template <typename GridwiseGemm>
+    struct ArgumentBase : public GridwiseGemm::Argument
     {
         index_t Batch;
         ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
 
-        Argument(const ADataType* p_a_grid_,
-                 const BDataType* p_b_grid_,
-                 CDataType* p_c_grid_,
-                 index_t M_,
-                 index_t N_,
-                 index_t K_,
-                 index_t StrideA_,
-                 index_t StrideB_,
-                 index_t StrideC_,
-                 index_t StrideScaleB_,
-                 index_t BatchStrideA_,
-                 index_t BatchStrideB_,
-                 index_t BatchStrideC_,
-                 index_t BatchStrideScaleB_,
-                 const BScaleDataType* p_b_scale_grid_,
-                 index_t Batch_,
-                 index_t KBatch_,
-                 AElementwiseOperation a_element_op_,
-                 BElementwiseOperation b_element_op_,
-                 CElementwiseOperation c_element_op_)
+        ArgumentBase(const ADataType* p_a_grid_,
+                     const BDataType* p_b_grid_,
+                     CDataType* p_c_grid_,
+                     index_t M_,
+                     index_t N_,
+                     index_t K_,
+                     index_t StrideA_,
+                     index_t StrideB_,
+                     index_t StrideC_,
+                     index_t StrideScaleB_,
+                     index_t BatchStrideA_,
+                     index_t BatchStrideB_,
+                     index_t BatchStrideC_,
+                     index_t BatchStrideScaleB_,
+                     const BScaleDataType* p_b_scale_grid_,
+                     index_t Batch_,
+                     index_t KBatch_,
+                     AElementwiseOperation a_element_op_,
+                     BElementwiseOperation b_element_op_,
+                     CElementwiseOperation c_element_op_)
             : GridwiseGemm::Argument(p_a_grid_,
                                      p_b_grid_,
                                      p_c_grid_,
@@ -323,12 +337,16 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         {
         }
     };
+    using Argument = ArgumentBase<GridwiseGemm64>;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const ArgumentBase<GridwiseGemm>& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
+            using DeviceArgument = ArgumentBase<GridwiseGemm>;
             if(stream_config.log_level_ > 0)
             {
                 arg.Print();
@@ -353,7 +371,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    DeviceArgument arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -365,7 +383,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
                                          sizeof(BDataType) / BPackedSize;
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<DeviceArgument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -422,7 +440,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     {
                         const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                             GridwiseGemm,
-                            Argument,
+                            DeviceArgument,
                             true,
                             InMemoryDataOperationEnum::AtomicAdd,
                             minimum_occupancy>;
@@ -432,7 +450,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     {
                         const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                             GridwiseGemm,
-                            Argument,
+                            DeviceArgument,
                             true,
                             InMemoryDataOperationEnum::Set,
                             minimum_occupancy>;
@@ -448,7 +466,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -460,7 +478,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -474,7 +492,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -490,7 +508,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -506,7 +524,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -522,7 +540,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -537,7 +555,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -553,7 +571,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::AtomicAdd,
                                     minimum_occupancy,
@@ -568,7 +586,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -580,7 +598,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -594,7 +612,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -610,7 +628,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -626,7 +644,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -642,7 +660,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -657,7 +675,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -673,7 +691,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                             {
                                 const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                     GridwiseGemm,
-                                    Argument,
+                                    DeviceArgument,
                                     true,
                                     InMemoryDataOperationEnum::Set,
                                     minimum_occupancy,
@@ -692,7 +710,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -703,7 +721,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -717,7 +735,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -728,7 +746,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -745,7 +763,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -756,7 +774,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::AtomicAdd,
                                 minimum_occupancy,
@@ -770,7 +788,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -781,7 +799,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                         {
                             const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                                 GridwiseGemm,
-                                Argument,
+                                DeviceArgument,
                                 true,
                                 InMemoryDataOperationEnum::Set,
                                 minimum_occupancy,
@@ -800,7 +818,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     {
                         const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                             GridwiseGemm,
-                            Argument,
+                            DeviceArgument,
                             false,
                             InMemoryDataOperationEnum::AtomicAdd,
                             minimum_occupancy>;
@@ -810,7 +828,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     {
                         const auto kernel = kernel_batched_gemm_b_scale_xdl_cshuffle_v3<
                             GridwiseGemm,
-                            Argument,
+                            DeviceArgument,
                             false,
                             InMemoryDataOperationEnum::Set,
                             minimum_occupancy>;
@@ -822,6 +840,27 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
             return ave_time;
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    using Argument32 = ArgumentBase<GridwiseGemm32>;
+                    return RunImp<GridwiseGemm32>(reinterpret_cast<const Argument32&>(arg),
+                                                  stream_config);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -838,11 +877,14 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -855,8 +897,22 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                using Argument32 = ArgumentBase<GridwiseGemm32>;
+                return GridwiseGemm32::CheckValidity(reinterpret_cast<const Argument32&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -1003,7 +1059,7 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
index 4934993693..4aa6c18d04 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -75,6 +75,9 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
     : public DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
     using DeviceOp = DeviceCGemm_4Gemm_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -118,7 +121,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
     }
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ALayout,
         BLayout,
         CLayout,
@@ -142,7 +146,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -164,13 +168,15 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using CGridDesc_M_N = decltype(MakeDescriptor_M_N({1, 1}, {1, 1}));
 
     // Argument
-    struct Argument : public tensor_operation::device::BaseArgument, public GridwiseGemm::Problem
+    struct Argument : public tensor_operation::device::BaseArgument, public GridwiseGemm64::Problem
     {
-        using Problem = typename GridwiseGemm::Problem;
+        using Problem = typename GridwiseGemm64::Problem;
 
         Argument(const ADataType* p_a_grid_real_,
                  const ADataType* p_a_grid_imag_,
@@ -221,14 +227,17 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
                 arg.Print();
             }
 
-            if(!GridwiseGemm::CheckValidity(arg))
+            typename GridwiseGemm::Problem problem(
+                arg.M, arg.N, arg.K, arg.StrideA, arg.StrideB, arg.StrideC);
+            if(!GridwiseGemm::CheckValidity(problem))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
@@ -317,7 +326,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_real,
                                                    arg.p_b_grid_real,
                                                    arg.p_aux_grid,
-                                                   arg);
+                                                   problem);
 
                 ave_time += launch_and_time_kernel(stream_config,
                                                    kernel,
@@ -327,7 +336,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_imag,
                                                    arg.p_b_grid_imag,
                                                    arg.p_aux_2_grid,
-                                                   arg);
+                                                   problem);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
@@ -352,7 +361,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_real,
                                                    arg.p_b_grid_imag,
                                                    arg.p_aux_grid,
-                                                   arg);
+                                                   problem);
 
                 ave_time += launch_and_time_kernel(stream_config,
                                                    kernel,
@@ -362,7 +371,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_imag,
                                                    arg.p_b_grid_real,
                                                    arg.p_aux_2_grid,
-                                                   arg);
+                                                   problem);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
@@ -395,7 +404,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_real,
                                                    arg.p_b_grid_real,
                                                    arg.p_aux_grid,
-                                                   arg);
+                                                   problem);
 
                 ave_time += launch_and_time_kernel(stream_config,
                                                    kernel,
@@ -405,7 +414,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_imag,
                                                    arg.p_b_grid_imag,
                                                    arg.p_aux_2_grid,
-                                                   arg);
+                                                   problem);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
@@ -430,7 +439,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_real,
                                                    arg.p_b_grid_imag,
                                                    arg.p_aux_grid,
-                                                   arg);
+                                                   problem);
 
                 ave_time += launch_and_time_kernel(stream_config,
                                                    kernel,
@@ -440,7 +449,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                    arg.p_a_grid_imag,
                                                    arg.p_b_grid_real,
                                                    arg.p_aux_2_grid,
-                                                   arg);
+                                                   problem);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
@@ -461,6 +470,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -477,12 +488,27 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                typename GridwiseGemm32::Problem problem(
+                    arg.M, arg.N, arg.K, arg.StrideA, arg.StrideB, arg.StrideC);
+                return GridwiseGemm32::CheckValidity(problem);
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -587,8 +613,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
     static std::size_t GetCElementSpaceSize(index_t M, index_t N, index_t StrideC)
     {
-        const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
-            M, GridwiseGemm::CalculateMPadded(M), N, GridwiseGemm::CalculateNPadded(N), StrideC);
+        const auto c_grid_desc_m_n =
+            GridwiseGemm64::MakeCGridDescriptor_M_N(M,
+                                                    GridwiseGemm64::CalculateMPadded(M),
+                                                    N,
+                                                    GridwiseGemm64::CalculateNPadded(N),
+                                                    StrideC);
 
         return c_grid_desc_m_n.GetElementSpaceSize();
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
index 27f0a7af7c..cf5d5bd64e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -55,22 +55,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
             e_grid_desc_mblock_mperblock_nblock_nperblock,
         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_as_grid,
-                                                  p_bs_grid,
-                                                  p_ds_grid,
-                                                  p_e_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  as_grid_desc_ak0_m_ak1,
-                                                  bs_grid_desc_bk0_n_bk1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_as_grid,
+            p_bs_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            as_grid_desc_ak0_m_ak1,
+            bs_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_as_grid;
     ignore = p_bs_grid;
@@ -160,6 +164,10 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
 {
     using DeviceOp = DeviceContractionMultipleABD_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumATensor = AsDataType::Size();
     static constexpr index_t NumBTensor = BsDataType::Size();
     static constexpr index_t NumDTensor = DsDataType::Size();
@@ -172,7 +180,8 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
     using ComputeDataType = EDataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleABD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleABD_xdl_cshuffle<
         AsDataType,
         BsDataType,
         ComputeDataType,
@@ -194,7 +203,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -217,6 +226,8 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     static constexpr auto matrix_padder =
         ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
@@ -385,21 +396,21 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
 
     // desc for blockwise copy
     using AsGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAsGridDescriptor_AK0_M_AK1(
             AsGridDesc_M_K{}))>;
     using BsGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBsGridDescriptor_BK0_N_BK1(
             BsGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -427,11 +438,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
               bs_grid_desc_n_k_{},
               ds_grid_desc_m_n_{},
               e_grid_desc_m_n_{MakeEGridDescriptor_M_N(e_ms_ns_length, e_ms_ns_stride)},
-              as_grid_desc_ak0_m_ak1_{},
-              bs_grid_desc_bk0_n_bk1_{},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
@@ -475,28 +482,6 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
                     MakeEGridDescriptor_M_N(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
             });
 
-            // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(as_grid_desc_m_k_,
-                                           bs_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
-            {
-                as_grid_desc_ak0_m_ak1_ =
-                    GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
-
-                bs_grid_desc_bk0_n_bk1_ =
-                    GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
-
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
-
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-            }
-
             // for sanity check of vector memory access
             for(index_t i = 0; i < NumATensor; ++i)
             {
@@ -521,9 +506,9 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         }
 
         // pointers
-        typename GridwiseGemm::AsGridPointer p_as_grid_;
-        typename GridwiseGemm::BsGridPointer p_bs_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::AsGridPointer p_as_grid_;
+        typename GridwiseGemm64::BsGridPointer p_bs_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -532,13 +517,6 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         DsGridDesc_M_N ds_grid_desc_m_n_;
         EGridDesc_M_N e_grid_desc_m_n_;
 
-        // tensor descriptors for block/thread-wise copy
-        AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1_;
-        BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1_;
-        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
-
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
 
@@ -564,7 +542,8 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
                                             arg.bs_grid_desc_n_k_,
@@ -574,7 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto as_grid_desc_ak0_m_ak1 =
+                GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(arg.as_grid_desc_m_k_);
 
+            auto bs_grid_desc_bk0_n_bk1 =
+                GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(arg.bs_grid_desc_n_k_);
+
+            auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.ds_grid_desc_m_n_);
+
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
 
@@ -609,10 +600,10 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
                                               arg.a_element_op_,
                                               arg.b_element_op_,
                                               arg.cde_element_op_,
-                                              arg.as_grid_desc_ak0_m_ak1_,
-                                              arg.bs_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              as_grid_desc_ak0_m_ak1,
+                                              bs_grid_desc_bk0_n_bk1,
+                                              ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_etile_map_);
             };
 
@@ -628,6 +619,8 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -638,11 +631,12 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        using A0DataType = remove_cvref_t<tuple_element_t<0, AsDataType>>;
+        using B0DataType = remove_cvref_t<tuple_element_t<0, BsDataType>>;
+        if(!ck::is_xdl_wmma_supported<A0DataType, B0DataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // check vector load/store
         {
             bool valid_as_access = true;
@@ -713,11 +707,30 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
             }
         }
 
-        return GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
-                                           arg.bs_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() > 0)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.as_grid_desc_m_k_,
+                                                     arg.bs_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.as_grid_desc_m_k_,
+                                                     arg.bs_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index 615566a555..2a569a49e9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -53,23 +53,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
             e_grid_desc_mblock_mperblock_nblock_nperblock,
         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        p_a_grid,
-        p_b_grid,
-        p_ds_grid,
-        p_e_grid,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        e_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            p_a_grid,
+            p_b_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -163,6 +166,10 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
 {
     using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -314,7 +321,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
     using EGridDesc_M_N  = decltype(MakeEGridDescriptor_M_N({}, {}));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
         ComputeDataType,
@@ -335,7 +343,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -357,24 +365,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -403,12 +413,10 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
               ds_grid_desc_m_n_{},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
@@ -425,22 +433,6 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                     DeviceOp::MakeEGridDescriptor_M_N(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
             });
 
-            // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
-            }
-
             // for sanity check of vector memory access
             tie(a_continous_dim_, a_max_read_elems_) =
                 CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);
@@ -471,7 +463,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -512,7 +504,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -523,7 +516,13 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                 throw std::runtime_error(
                     "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
             }
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
 
+            auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.ds_grid_desc_m_n_);
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
 
@@ -562,8 +561,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                                               arg.cde_element_op_,
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_etile_map_);
             };
 
@@ -577,6 +576,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -587,21 +588,39 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeDataType, ComputeDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!ck::is_lds_direct_load_supported() && std::is_same<ADataType, double>::value)
         {
             return false;
         }
+        bool valid = false;
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                valid = GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                      arg.b_grid_desc_n_k_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                valid = GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                      arg.b_grid_desc_n_k_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
+        }
 
-        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                        arg.b_grid_desc_n_k_,
-                                        arg.ds_grid_desc_m_n_,
-                                        arg.e_grid_desc_m_n_,
-                                        arg.block_2_etile_map_))
+        if(!valid)
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 23440e24f6..ff652ebefb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -72,6 +72,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
     using DeviceOp =
         DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = OutDataType;
     using BDataType = InDataType;
     using CDataType = WeiDataType;
@@ -281,7 +285,8 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -300,7 +305,7 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         NPerXdl,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -329,8 +334,11 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         true,
         true>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmAtomicAddBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -349,7 +357,7 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         NPerXdl,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -378,6 +386,9 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         true,
         true>;
+    using GridwiseGemmAtomicAdd64 = GridwiseGemmAtomicAddBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemmAtomicAdd32 = GridwiseGemmAtomicAddBase<NXdlPerWave32>;
+
     // Argument
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
         decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
@@ -506,7 +517,9 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -635,6 +648,8 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -650,11 +665,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // vector load A/B matrix from global memory
         if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 &&
              arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 &&
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index d4f89b3e09..2be31cabed 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -35,8 +35,8 @@ template <typename InDataType,
           ck::index_t NPerBlock,
           ck::index_t K0PerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -69,6 +69,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 {
     using DeviceOp = DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = OutDataType;
     using BDataType = WeiDataType;
     using CDataType = InDataType;
@@ -374,7 +378,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -386,11 +391,11 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         MPerBlock,
         NPerBlock,
         K0PerBlock,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -410,6 +415,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
         7,                                // CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -512,7 +519,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -603,6 +611,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -618,11 +628,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvBackwardDataSpecialization ==
                      ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
         {
@@ -651,14 +660,30 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
 
         // Gridwise GEMM size
+        bool isWave64 = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
-                                            arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i]))
+            bool valid = false;
+            if(isWave64)
             {
-                return false;
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    valid = GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
+                                                          arg.b_grid_desc_k0_n_k1_container_[i],
+                                                          arg.c_grid_desc_m_n_container_[i]);
+                }
             }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    valid = GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
+                                                          arg.b_grid_desc_k0_n_k1_container_[i],
+                                                          arg.c_grid_desc_m_n_container_[i]);
+                }
+            }
+            if(!valid)
+                return false;
         }
         return true;
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 5d039427d6..94a0dc8c84 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -68,6 +68,10 @@ struct
     using DeviceOp =
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = InDataType;
     using BDataType = WeiDataType;
     using CDataType = OutDataType;
@@ -467,7 +471,8 @@ struct
     using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -488,7 +493,7 @@ struct
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
         Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -509,6 +514,8 @@ struct
         CShuffleNXdlPerWavePerShuffle,
         CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl,
         CBlockTransferScalarPerVector_NWaveNPerXdl>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -541,9 +548,6 @@ struct
               c_grid_desc_m_n_{},
               c0_grid_desc_m_n_{},
               c1_grid_desc_m_n_{},
-              c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
-              c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
-              c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
               block_2_ctile_map_{},
               in_element_op_{in_element_op},
               wei_element_op_{wei_element_op},
@@ -578,27 +582,6 @@ struct
             c1_grid_desc_m_n_    = descs[I4];
 
             block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c_grid_desc_m_n_);
-
-                c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c0_grid_desc_m_n_);
-
-                c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c1_grid_desc_m_n_);
-            }
         }
 
         //  private:
@@ -612,15 +595,7 @@ struct
         CGridDesc_M_N c_grid_desc_m_n_;
         C0GridDesc_M_N c0_grid_desc_m_n_;
         C1GridDesc_M_N c1_grid_desc_m_n_;
-        typename GridwiseGemm::
-            CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::
-            C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::
-            C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
+
         Block2CTileMap block_2_ctile_map_;
         InElementwiseOperation in_element_op_;
         WeiElementwiseOperation wei_element_op_;
@@ -643,7 +618,8 @@ struct
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -700,6 +676,20 @@ struct
 
             float ave_time = 0;
 
+            auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c_grid_desc_m_n_);
+
+            auto c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c0_grid_desc_m_n_);
+
+            auto c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c1_grid_desc_m_n_);
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_gemm_xdlops_v3r3<
@@ -736,9 +726,9 @@ struct
                     arg.p_c1_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -780,9 +770,9 @@ struct
                     arg.p_c1_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -792,6 +782,8 @@ struct
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -807,11 +799,10 @@ struct
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
@@ -851,10 +842,27 @@ struct
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index 242f5cd673..3a3b29a168 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -69,6 +69,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
     using DeviceOp =
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = InDataType;
     using BDataType = WeiDataType;
     using CDataType = OutDataType;
@@ -448,7 +452,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
     using C0GridDesc_M_N    = remove_cvref_t<decltype(ABCGridDescs{}[I3])>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -468,7 +473,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
         Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -489,6 +494,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
         CShuffleNXdlPerWavePerShuffle,
         CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl,
         CBlockTransferScalarPerVector_NWaveNPerXdl>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -520,8 +527,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
               b_grid_desc_k0_n_k1_{},
               c_grid_desc_m_n_{},
               c0_grid_desc_m_n_{},
-              c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
-              c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
               block_2_ctile_map_{},
               M01_{M01},
               N01_{N01},
@@ -556,23 +561,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
             c_grid_desc_m_n_     = descs[I2];
             c0_grid_desc_m_n_    = descs[I3];
             block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c_grid_desc_m_n_);
-
-                c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c0_grid_desc_m_n_);
-            }
+                GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
         }
 
         const ADataType* p_a_grid_;
@@ -583,13 +572,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
         C0GridDesc_M_N c0_grid_desc_m_n_;
-        typename GridwiseGemm::
-            CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::
-            C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         index_t M01_;
         index_t N01_;
         InElementwiseOperation in_element_op_;
@@ -613,7 +596,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -667,6 +651,15 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
 
             float ave_time = 0;
 
+            auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c_grid_desc_m_n_);
+
+            auto c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c0_grid_desc_m_n_);
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_gemm_xdlops_v3r2<
@@ -699,8 +692,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     arg.p_c0_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -738,8 +731,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     arg.p_c0_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
-                    arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+                    c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -749,6 +742,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -764,11 +759,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
@@ -808,10 +802,27 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 0d295a2418..3bc5f9af03 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -36,8 +36,8 @@ template <
     ck::index_t NPerBlock,
     ck::index_t K0PerBlock,
     ck::index_t K1,
-    ck::index_t MPerXdl,
-    ck::index_t NPerXdl,
+    ck::index_t MPerXDL,
+    ck::index_t NPerXDL,
     ck::index_t MXdlPerWave,
     ck::index_t NXdlPerWave,
     typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -72,6 +72,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
 {
     using DeviceOp = DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = InDataType;
     using BDataType = WeiDataType;
     using CDataType = OutDataType;
@@ -433,7 +437,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
     using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -451,10 +456,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
         K0PerBlock * K1,
         K1, // AK1
         K1, // BK1
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
         Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -475,6 +480,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
         CShuffleNXdlPerWavePerShuffle,
         CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl,
         CBlockTransferScalarPerVector_NWaveNPerXdl>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<math::max(NXdlPerWave32, 1)>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -501,7 +508,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
               a_grid_desc_k0_m_k1_{},
               b_grid_desc_k0_n_k1_{},
               c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
               block_2_ctile_map_{},
               in_element_op_{in_element_op},
               wei_element_op_{wei_element_op},
@@ -534,28 +540,13 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
             c_grid_desc_m_n_     = descs[I2];
 
             block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ =
-                    GridwiseGemm::
-                        MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
-                            c_grid_desc_m_n_);
-            }
         }
-
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
         CDataType* p_c_grid_;
         AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::
-            CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
         Block2CTileMap block_2_ctile_map_;
         InElementwiseOperation in_element_op_;
         WeiElementwiseOperation wei_element_op_;
@@ -578,8 +569,21 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                            arg.b_grid_desc_k0_n_k1_,
+                                            arg.c_grid_desc_m_n_,
+                                            arg.block_2_ctile_map_))
+            {
+                throw std::runtime_error(
+                    "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
+            }
+            auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
+                GridwiseGemm::
+                    MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(
+                        arg.c_grid_desc_m_n_);
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
                 std::cout << DeviceOp{}.GetTypeString() << std::endl;
@@ -614,35 +618,25 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                 std::cout
                     << "arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_"
                        "nwavenperxdl_{ "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I0)
                     << ", "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I1)
                     << ", "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I2)
                     << ", "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I3)
                     << ", "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I4)
                     << ", "
-                    << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_
+                    << c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
                            .GetLength(I5)
                     << "}" << std::endl;
             }
-
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
-            {
-                throw std::runtime_error(
-                    "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
-            }
-
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
 
@@ -679,7 +673,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     arg.p_c_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -713,7 +707,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     arg.p_c_grid_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_,
+                    c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
                     arg.in_element_op_,
                     arg.wei_element_op_,
                     arg.out_element_op_,
@@ -723,6 +717,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -738,11 +734,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
@@ -782,10 +777,27 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index c7aa54f1d9..cecfa48408 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -69,6 +69,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 {
     using DeviceOp = DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = InDataType;
     using BDataType = WeiDataType;
     using CDataType = OutDataType;
@@ -324,7 +328,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -340,7 +345,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
         Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -360,6 +365,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
         7,                                // CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -430,7 +437,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -510,6 +518,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -525,11 +535,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
@@ -569,8 +578,23 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(
-            arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(
+                    arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index dc8499fcf2..09dc2a03db 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -56,32 +56,35 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const CElementwiseOperation c_element_op,
         const Block2CTileMap block_2_ctile_map)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(a_batch_stride) * g_idx);
-    const long_index_t b_batch_offset =
-        __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(b_batch_stride) * g_idx);
-    const long_index_t c_batch_offset =
-        __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(c_batch_stride) * g_idx);
+        const long_index_t a_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(a_batch_stride) * g_idx);
+        const long_index_t b_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(b_batch_stride) * g_idx);
+        const long_index_t c_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(c_batch_stride) * g_idx);
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+                                                      p_b_grid + b_batch_offset,
+                                                      p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_grid_desc_k0_m_k1,
+                                                      b_grid_desc_k0_n_k1,
+                                                      c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -140,6 +143,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
 {
     using DeviceOp = DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = InDataType;
     using BDataType = WeiDataType;
     using CDataType = OutDataType;
@@ -263,7 +270,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
     using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
         BlockSize,
         InDataType,
         AccDataType,
@@ -282,7 +290,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
         Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -302,6 +310,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
         Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
         7,
         CThreadTransferDstScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
         decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
@@ -399,7 +409,9 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -507,6 +519,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -523,11 +537,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                            arg.b_grid_desc_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
index 2881036bee..d074342127 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -36,8 +36,8 @@ template <ck::index_t NDimSpatial,
           ck::index_t NPerBlock,
           ck::index_t K0PerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -80,6 +80,10 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
 {
     using DeviceOp = DeviceConvNdBwdDataNwcKxcNwk_Xdl;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using ADataType = OutDataType;
     using BDataType = WeiDataType;
     using CDataType = InDataType;
@@ -975,7 +979,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
         BlockSize,
         ABDataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -987,11 +992,11 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         MPerBlock,
         NPerBlock,
         K0PerBlock,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -1011,6 +1016,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
         7,                                // CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -1216,7 +1223,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -1305,6 +1313,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -1320,11 +1330,10 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if constexpr(ConvBackwardDataSpecialization ==
                      ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
         {
@@ -1354,14 +1363,30 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         }
 
         // Gridwise GEMM size
+        bool isWave64 = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
-                                            arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i]))
+            bool valid = false;
+            if(isWave64)
             {
-                return false;
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    valid = GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
+                                                          arg.b_grid_desc_k0_n_k1_container_[i],
+                                                          arg.c_grid_desc_m_n_container_[i]);
+                }
             }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    valid = GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
+                                                          arg.b_grid_desc_k0_n_k1_container_[i],
+                                                          arg.c_grid_desc_m_n_container_[i]);
+                }
+            }
+            if(!valid)
+                return false;
         }
         return true;
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
index 5bfef60af2..3929525987 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -81,6 +81,10 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
 {
     using DeviceOp = DeviceGemmBiasAddReduce_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -383,7 +387,8 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
     using ReduceGridDesc_M    = decltype(MakeReduceGridDescriptor_M(1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -417,7 +422,7 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -442,6 +447,8 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
         CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
         CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -477,11 +484,7 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
               c0_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, 0)},
               c1_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC1)},
               reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              c0_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              c1_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              reduce_grid_desc_mblock_mperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               c_element_op_{c_element_op},
@@ -489,26 +492,6 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
               reduce_in_element_ops_{reduce_in_element_ops},
               reduce_out_element_ops_{reduce_out_element_ops}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-
-                c0_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c0_grid_desc_m_n_);
-
-                c1_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c1_grid_desc_m_n_);
-
-                reduce_grid_desc_mblock_mperblock_ =
-                    GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_);
-            }
         }
 
         //  private:
@@ -524,15 +507,7 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
         C0GridDesc_M_N c0_grid_desc_m_n_;
         C1GridDesc_M_N c1_grid_desc_m_n_;
         ReduceGridDesc_M reduce_grid_desc_m_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c0_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c1_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock
-            reduce_grid_desc_mblock_mperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CElementwiseOperation c_element_op_;
@@ -546,7 +521,8 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
                                             arg.b_grid_desc_bk0_n_bk1_,
@@ -555,7 +531,20 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
 
+            auto c0_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c0_grid_desc_m_n_);
+
+            auto c1_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c1_grid_desc_m_n_);
+
+            auto reduce_grid_desc_mblock_mperblock =
+                GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(arg.reduce_grid_desc_m_);
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
 
@@ -607,10 +596,10 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
                                            arg.reduce_out_element_ops_,
                                            arg.a_grid_desc_ak0_m_ak1_,
                                            arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
+                                           c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           reduce_grid_desc_mblock_mperblock,
                                            arg.block_2_ctile_map_);
             }
             else
@@ -657,16 +646,18 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
                                            arg.reduce_out_element_ops_,
                                            arg.a_grid_desc_ak0_m_ak1_,
                                            arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
+                                           c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           reduce_grid_desc_mblock_mperblock,
                                            arg.block_2_ctile_map_);
             }
 
             return elapsed_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -683,15 +674,31 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceO
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..48914479bc
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/// @brief \"Universal\" GEMM operation with SplitK support and multiple D tensors.
+///
+/// @par Overview
+///         This GEMM operation implements the following mathematical equation:
+///         E{M,N} = CDE_op(A_op(As{M,K}...) * B_op(Bs{K,N}...), Ds{M,N}...)
+///         Where As, Bs, Ds are input tensors and E is the output tensor. The A/B_op are
+///         elementwise
+//          operations that could be applied on each tensor respectively. The CDE_op is an
+//          elementwise operation applied to the C and all D tensors.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam AsLayout    A tensors data layouts.
+/// @tparam BsLayout    B tensors data layouts.
+/// @tparam DsLayout    D tensors data layouts.
+/// @tparam ELayout     E tensor data layout.
+/// @tparam AsDataType  A tensors data types.
+/// @tparam BsDataType  B tensors data types.
+/// @tparam DsDataType  D tensors data types.
+/// @tparam EDataType   E tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CDEElementwiseOperation Elementwise operation applied to the C output tensor (after
+///                                 GEMM) and D input tensors.
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1         The vector load size from global memory for A tensor.
+/// @tparam BK1         The vector load size from global memory for B tensor.
+/// @tparam MPerWmma    M size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam NPerWmma    N size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam MRepeat     The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NRepeat     The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CDEShuffleBlockTransferScalarPerVectors The size of vectorized memory access.
+///                                         Used when loading data from D tensors and storing data
+///                                         to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = EDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemmMultipleABD_Wmma_CShuffleV3
+    : public DeviceGemmMultipleABDSplitK<AsLayout,
+                                         BsLayout,
+                                         DsLayout,
+                                         ELayout,
+                                         AsDataType,
+                                         BsDataType,
+                                         DsDataType,
+                                         EDataType,
+                                         AElementwiseOperation,
+                                         BElementwiseOperation,
+                                         CDEElementwiseOperation>
+{
+    // Note: Pass multiple layout but then using only the first one
+    // This is to replicate xdl functionality but it should be extended
+    using ALayout = remove_cvref_t<tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<tuple_element_t<0, BsLayout>>;
+
+    using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
+        ALayout,
+        BLayout,
+        DsLayout,
+        ELayout,
+        AsDataType,
+        BsDataType,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerWmma,
+        NPerWmma,
+        MRepeat,
+        NRepeat,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    using DeviceGemmCommon =
+        DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
+                                          AsDataType,
+                                          BsDataType,
+                                          DsDataType,
+                                          EDataType,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          BlockSize,
+                                          AK1,
+                                          BK1,
+                                          GemmSpec,
+                                          CDEShuffleBlockTransferScalarPerVectors,
+                                          BlkGemmPipeSched,
+                                          BlkGemmPipelineVer,
+                                          ComputeTypeA,
+                                          ComputeTypeB>;
+
+    // Invoker
+    using Invoker = typename DeviceGemmCommon::Invoker;
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        return DeviceGemmCommon::IsSupportedArgument(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(std::array<const void*, GridwiseGemm::NumATensor> p_as,
+                             std::array<const void*, GridwiseGemm::NumBTensor> p_bs,
+                             std::array<const void*, GridwiseGemm::NumDTensor> p_ds,
+                             void* p_e,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             std::array<ck::index_t, GridwiseGemm::NumATensor> StrideAs,
+                             std::array<ck::index_t, GridwiseGemm::NumBTensor> StrideBs,
+                             std::array<index_t, GridwiseGemm::NumDTensor> StrideDs,
+                             index_t StrideE,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{p_as,
+                        p_bs,
+                        p_ds,
+                        static_cast<EDataType*>(p_e),
+                        M,
+                        N,
+                        K,
+                        StrideAs,
+                        StrideBs,
+                        StrideDs,
+                        StrideE,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, GridwiseGemm::NumATensor> p_as,
+                        std::array<const void*, GridwiseGemm::NumBTensor> p_bs,
+                        std::array<const void*, GridwiseGemm::NumDTensor> p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        std::array<ck::index_t, GridwiseGemm::NumATensor> StrideAs,
+                        std::array<ck::index_t, GridwiseGemm::NumBTensor> StrideBs,
+                        std::array<ck::index_t, GridwiseGemm::NumDTensor> StrideDs,
+                        index_t StrideE,
+                        index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_as,
+                                          p_bs,
+                                          p_ds,
+                                          static_cast<EDataType*>(p_e),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideAs,
+                                          StrideBs,
+                                          StrideDs,
+                                          StrideE,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmMultipleABD_Wmma_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", ";
+        static_for<0, GridwiseGemm::NumATensor, 1>{}([&](auto i) {
+            using ALayout_ = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+
+            str << std::string(ALayout_::name)[0];
+        });
+        static_for<0, GridwiseGemm::NumBTensor, 1>{}([&](auto i) {
+            using BLayout_ = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+
+            str << std::string(BLayout_::name)[0];
+        });
+        static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+            str << std::string(DLayout::name)[0];
+        });
+        str << std::string(ELayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock << "x" << NPerBlock << "x" << KPerBlock << ", "
+            << "WaveTile: "
+            << MPerWmma << "x"<<NPerWmma << ", "
+            << "WaveMap: "
+            << MRepeat << "x" << NRepeat << ", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector << "x" << BBlockTransferSrcScalarPerVector << ", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "KPack: "
+            << GridwiseGemm::KPack;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
index 4d9dd192c9..db62dd340f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -80,6 +80,10 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
                                                                          BElementwiseOperation,
                                                                          CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumATensor = AsDataType::Size();
     static constexpr index_t NumBTensor = BsDataType::Size();
     static constexpr index_t NumDTensor = DsDataType::Size();
@@ -88,7 +92,8 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
     using BLayout = remove_cvref_t<tuple_element_t<0, BsLayout>>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -111,7 +116,7 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -136,13 +141,17 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -570,6 +579,8 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -586,11 +597,10 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
                                                        GemmSpec == GemmSpecialization::MNKPadding ||
@@ -599,7 +609,22 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -739,7 +764,7 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index 0a1ec2c1b8..52ecbeea6b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -62,29 +62,32 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const Block2ETileMap block_2_etile_map,
         index_t NRaw)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemmWelford::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__)
+    if constexpr(GridwiseGemmWelford::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemmWelford::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemmWelford::template Run<HasMainKBlockLoop>(
-        p_a_grid,
-        p_b_grid,
-        p_ds_grid,
-        p_e_grid,
-        p_welford_mean_grid,
-        p_welford_var_grid,
-        p_welford_count_grid,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        e_grid_desc_mblock_mperblock_nblock_nperblock,
-        mean_var_grid_desc_mblock_mperblock_nblock,
-        count_grid_desc_mblock_mperblock_nblock,
-        block_2_etile_map,
-        NRaw);
+        GridwiseGemmWelford::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_welford_mean_grid,
+            p_welford_var_grid,
+            p_welford_count_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            mean_var_grid_desc_mblock_mperblock_nblock,
+            count_grid_desc_mblock_mperblock_nblock,
+            block_2_etile_map,
+            NRaw);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -198,9 +201,9 @@ template <typename ALayout,
           GemmSpecialization GemmSpec,
           index_t NumGemmKPrefetchStage,
           index_t BlockSize,
-          index_t GemmMPerBlock,
-          index_t GemmNPerBlock,
-          index_t GemmKPerBlock,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
           index_t AK1,
           index_t BK1,
           index_t MPerXDL,
@@ -254,6 +257,10 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     using DeviceOp = DeviceGemmMultipleDLayernorm_Xdl_CShuffle;
     using ELayout  = HLayout;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor                  = DsDataType::Size();
     static constexpr index_t LayernormHDstVectorSize     = PostShuffleScalarPerVector;
     static constexpr index_t LayernormGammaSrcVectorSize = PostShuffleScalarPerVector;
@@ -268,8 +275,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
 
-    static constexpr auto matrix_padder = MatrixPadder<GemmSpec, index_t, index_t, index_t>{
-        GemmMPerBlock, GemmNPerBlock, GemmKPerBlock};
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
 
     static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
     {
@@ -326,7 +333,7 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
                 static_assert(is_same<tensor_layout::gemm::RowMajor, DLayout>::value);
 
                 return DeviceOp::
-                    MakeEHGridDescriptor_M_N<Sequence<true, true>, GemmMPerBlock, GemmNPerBlock>(
+                    MakeEHGridDescriptor_M_N<Sequence<true, true>, MPerBlock, NPerBlock>(
                         MRaws[i], NRaws[i], DsStride[i]);
             },
             Number<NumDTensor>{});
@@ -363,12 +370,10 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     // We have to separate mean var descriptor for gemm and layernorm bacause of different grid
     // layout(different padding)
     using GemmMeanVarGridDesc_M_NBlock =
-        decltype(MakeMeanVarDescriptor_M_N<Sequence<true, false>, GemmMPerBlock, GemmNPerBlock>(1,
-                                                                                                1));
+        decltype(MakeMeanVarDescriptor_M_N<Sequence<true, false>, MPerBlock, NPerBlock>(1, 1));
 
     using GemmCountGridDesc_M_NBlock =
-        decltype(MakeCountDescriptor_M_N<Sequence<true, false>, GemmMPerBlock, GemmNPerBlock>(1,
-                                                                                              1));
+        decltype(MakeCountDescriptor_M_N<Sequence<true, false>, MPerBlock, NPerBlock>(1, 1));
 
     using LayernormMeanVarGridDesc_M_NBlock =
         decltype(MakeMeanVarDescriptor_M_N<Sequence<true, true>,
@@ -383,7 +388,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     using GammaBetaGridDesc_N = decltype(MakeDescriptor_X<LayernormBlockTileSize_M_N::At(1)>(1));
     using EHGridDesc_M_N = decltype(MakeEHGridDescriptor_M_N<Sequence<true, true>, 1, 1>(1, 1, 1));
 
-    using GridwiseGemmWelford = GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmWelfordBase = GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CShuffleDataType,
@@ -401,15 +407,15 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         GemmCountGridDesc_M_NBlock,
         NumGemmKPrefetchStage,
         BlockSize,
-        GemmMPerBlock,
-        GemmNPerBlock,
-        GemmKPerBlock,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
         AK1,
         BK1,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -432,8 +438,10 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         PostShuffleScalarPerVector,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemmWelford64 = GridwiseGemmWelfordBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemmWelford32 = GridwiseGemmWelfordBase<NXdlPerWave32>;
 
-    using Block2ETileMap = typename GridwiseGemmWelford::DefaultBlock2ETileMap;
+    using Block2ETileMap = typename GridwiseGemmWelford64::DefaultBlock2ETileMap;
 
     using GridwiseWelfordLayernorm =
         GridwiseWelfordSecondHalfLayernorm2d<EMeanVarDataType,
@@ -491,9 +499,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
               b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)},
               ds_grid_desc_m_n_{},
               gemm_e_grid_desc_m_n_{
-                  DeviceOp::MakeEHGridDescriptor_M_N<Sequence<true, true>,
-                                                     GemmMPerBlock,
-                                                     GemmNPerBlock>(MRaw, NRaw, StrideH)},
+                  DeviceOp::MakeEHGridDescriptor_M_N<Sequence<true, true>, MPerBlock, NPerBlock>(
+                      MRaw, NRaw, StrideH)},
               layernorm_e_grid_desc_m_n_{
                   DeviceOp::MakeEHGridDescriptor_M_N<Sequence<true, true>,
                                                      LayernormBlockTileSize_M_N::At(0),
@@ -513,11 +520,11 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
                                                      LayernormBlockTileSize_M_N::At(1)>(
                       MRaw, NRaw, StrideH)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemmWelford::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemmWelford64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemmWelford::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+                  GridwiseGemmWelford64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               block_2_etile_map_{
-                  GridwiseGemmWelford::MakeDefaultBlock2ETileMap(gemm_e_grid_desc_m_n_)},
+                  GridwiseGemmWelford64::MakeDefaultBlock2ETileMap(gemm_e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -525,16 +532,16 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
               MRaw_{MRaw},
               NRaw_{NRaw},
               KRaw_{KRaw},
-              gemm_nblock_{math::integer_divide_ceil(NRaw, GemmNPerBlock)},
+              gemm_nblock_{math::integer_divide_ceil(NRaw, NPerBlock)},
               epsilon_{static_cast<AccDataType>(epsilon)}
         {
             // We don't need to pad in N dimension in gemm for mean/var/count. Set NPerTile 1.
             gemm_mean_var_grid_desc_m_nblock_ =
-                DeviceOp::MakeMeanVarDescriptor_M_N<Sequence<true, false>, GemmMPerBlock, 1>(
+                DeviceOp::MakeMeanVarDescriptor_M_N<Sequence<true, false>, MPerBlock, 1>(
                     MRaw, gemm_nblock_);
 
             gemm_count_grid_desc_m_nblock_ =
-                DeviceOp::MakeCountDescriptor_M_N<Sequence<true, false>, GemmMPerBlock, 1>(
+                DeviceOp::MakeCountDescriptor_M_N<Sequence<true, false>, MPerBlock, 1>(
                     MRaw, gemm_nblock_);
 
             layernorm_mean_var_grid_desc_m_nblock_ =
@@ -558,33 +565,66 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
 
                 // D desc
                 ds_grid_desc_m_n_(i) =
-                    DeviceOp::MakeEHGridDescriptor_M_N<Sequence<true, true>,
-                                                       GemmMPerBlock,
-                                                       GemmNPerBlock>(MRaw, NRaw, StrideDs[i]);
+                    DeviceOp::MakeEHGridDescriptor_M_N<Sequence<true, true>, MPerBlock, NPerBlock>(
+                        MRaw, NRaw, StrideDs[i]);
             });
 
             // populate desc for Ds/E/mean/var/count
-            if(GridwiseGemmWelford::CheckValidity(a_grid_desc_m_k_,
-                                                  b_grid_desc_n_k_,
-                                                  ds_grid_desc_m_n_,
-                                                  gemm_e_grid_desc_m_n_,
-                                                  block_2_etile_map_))
+            if(get_warp_size() == 64)
             {
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemmWelford::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    if(GridwiseGemmWelford64::CheckValidity(a_grid_desc_m_k_,
+                                                            b_grid_desc_n_k_,
+                                                            ds_grid_desc_m_n_,
+                                                            gemm_e_grid_desc_m_n_,
+                                                            block_2_etile_map_))
+                    {
+                        ds_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmWelford64::
+                            MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                ds_grid_desc_m_n_);
 
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemmWelford::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        gemm_e_grid_desc_m_n_);
+                        e_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmWelford64::
+                            MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                gemm_e_grid_desc_m_n_);
 
-                gemm_mean_var_grid_desc_mblock_mperblock_nblock_ =
-                    GridwiseGemmWelford::MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
-                        gemm_mean_var_grid_desc_m_nblock_);
+                        gemm_mean_var_grid_desc_mblock_mperblock_nblock_ = GridwiseGemmWelford64::
+                            MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
+                                gemm_mean_var_grid_desc_m_nblock_);
 
-                gemm_count_grid_desc_mblock_mperblock_nblock_ =
-                    GridwiseGemmWelford::MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
-                        gemm_count_grid_desc_m_nblock_);
+                        gemm_count_grid_desc_mblock_mperblock_nblock_ = GridwiseGemmWelford64::
+                            MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
+                                gemm_count_grid_desc_m_nblock_);
+                    }
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    if(GridwiseGemmWelford32::CheckValidity(a_grid_desc_m_k_,
+                                                            b_grid_desc_n_k_,
+                                                            ds_grid_desc_m_n_,
+                                                            gemm_e_grid_desc_m_n_,
+                                                            block_2_etile_map_))
+                    {
+                        ds_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmWelford32::
+                            MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                ds_grid_desc_m_n_);
+
+                        e_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmWelford32::
+                            MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                gemm_e_grid_desc_m_n_);
+
+                        gemm_mean_var_grid_desc_mblock_mperblock_nblock_ = GridwiseGemmWelford32::
+                            MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
+                                gemm_mean_var_grid_desc_m_nblock_);
+
+                        gemm_count_grid_desc_mblock_mperblock_nblock_ = GridwiseGemmWelford32::
+                            MakeMeanVarCountGridDescriptor_MBlock_MPerBlock_NBlock(
+                                gemm_count_grid_desc_m_nblock_);
+                    }
+                }
             }
         }
 
@@ -602,7 +642,7 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemmWelford::DsGridPointer p_ds_grid_;
+        typename GridwiseGemmWelford64::DsGridPointer p_ds_grid_;
         void* p_workspace_e_grid_;
         void* p_workspace_mean_;
         void* p_workspace_var_;
@@ -626,15 +666,15 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         EHGridDesc_M_N h_grid_desc_m_n_;
 
         // tensor descriptors for block/thread-wise copy
-        typename GridwiseGemmWelford::DefaultAGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        typename GridwiseGemmWelford::DefaultBGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemmWelford::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemmWelford64::DefaultAGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
+        typename GridwiseGemmWelford64::DefaultBGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
+        typename GridwiseGemmWelford64::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemmWelford::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemmWelford64::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemmWelford::MeanVarGridDescriptor_MBlock_MPerBlock_NBlock
+        typename GridwiseGemmWelford64::MeanVarGridDescriptor_MBlock_MPerBlock_NBlock
             gemm_mean_var_grid_desc_mblock_mperblock_nblock_;
-        typename GridwiseGemmWelford::CountGridDescriptor_MBlock_MPerBlock_NBlock
+        typename GridwiseGemmWelford64::CountGridDescriptor_MBlock_MPerBlock_NBlock
             gemm_count_grid_desc_mblock_mperblock_nblock_;
 
         // block-to-e-tile map
@@ -657,8 +697,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     struct Invoker : public BaseInvoker
     {
         using Argument = DeviceOp::Argument;
-
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemmWelford>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float avg_time = 0;
 
@@ -787,7 +827,9 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
                 return launch_kernel(integral_constant<bool, false>{});
             }
         }
-
+        using GridwiseGemm32 = GridwiseGemmWelford32;
+        using GridwiseGemm64 = GridwiseGemmWelford64;
+        INVOKER_RUN_IMPL
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -857,11 +899,10 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // check vector load/store
         {
             using Row = ck::tensor_layout::gemm::RowMajor;
@@ -944,12 +985,36 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
                 return false;
             }
         }
-
-        return GridwiseGemmWelford::CheckValidity(arg.a_grid_desc_m_k_,
-                                                  arg.b_grid_desc_n_k_,
-                                                  arg.ds_grid_desc_m_n_,
-                                                  arg.gemm_e_grid_desc_m_n_,
-                                                  arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemmWelford64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                            arg.b_grid_desc_n_k_,
+                                                            arg.ds_grid_desc_m_n_,
+                                                            arg.gemm_e_grid_desc_m_n_,
+                                                            arg.block_2_etile_map_);
+            }
+            else
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemmWelford32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                            arg.b_grid_desc_n_k_,
+                                                            arg.ds_grid_desc_m_n_,
+                                                            arg.gemm_e_grid_desc_m_n_,
+                                                            arg.block_2_etile_map_);
+            }
+            else
+            {
+                return false;
+            }
+        }
     }
 
     // polymorphic
@@ -1060,9 +1125,9 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         str << "DeviceGemmMultipleDLayernorm_Xdl_CShuffle"
             << "<"
             << BlockSize << ", "
-            << GemmMPerBlock << ", "
-            << GemmNPerBlock << ", "
-            << GemmKPerBlock << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
             << AK1 << ", "
             << BK1 << ", "
             << getGemmSpecializationString(GemmSpec) << ", "
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 8ae6761769..ccc9c7f9b8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -60,26 +60,30 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock,
         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_ds_grid,
-                                                  p_e_grid,
-                                                  p_rs_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  qs_element_op,
-                                                  rs_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  rs_grid_desc_mblock_mperblock,
-                                                  block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_rs_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            qs_element_op,
+            rs_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            rs_grid_desc_mblock_mperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -185,6 +189,10 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
 {
     using DeviceOp = DeviceGemmMultipleDMultipleR_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor = DsDataType::Size();
     static constexpr index_t NumRTensor = RsDataType::Size();
 
@@ -282,7 +290,8 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
     using RGridDesc_M   = decltype(MakeRGridDescriptor_M(1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -312,7 +321,7 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -335,15 +344,17 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
         CDEReduceThreadTransferScalarPerVector_NPerBlock,
         RThreadTransferDstScalarPerVector_MPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
 
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    using Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap;
 
     // Argument
     struct Argument : public BaseArgument
@@ -370,86 +381,52 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
               p_ds_grid_{}, // FIXME
               p_e_grid_{static_cast<EDataType*>(p_e_grid)},
               p_rs_grid_{}, // FIXME
+              MRaw_(MRaw),
+              NRaw_(NRaw),
               a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)},
               b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideE)},
               r_grid_desc_m_{DeviceOp::MakeRGridDescriptor_M(MRaw)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              rs_grid_desc_mblock_mperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
               qs_element_op_{qs_element_op},
               rs_element_op_{rs_element_op}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           e_grid_desc_m_n_,
-                                           r_grid_desc_m_,
-                                           block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-
-                static_for<0, NumDTensor, 1>{}([&](auto i) {
-                    using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
-
-                    p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
-
-                    const auto d_grid_desc_m_n =
-                        DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideDs[i]);
-
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_(i) =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            d_grid_desc_m_n);
-                });
-
-                static_for<0, NumRTensor, 1>{}([&](auto i) {
-                    using RDataType = remove_cvref_t<tuple_element_t<i.value, RsDataType>>;
-
-                    p_rs_grid_(i) = static_cast<RDataType*>(p_rs_grid[i]);
-
-                    rs_grid_desc_mblock_mperblock_(i) =
-                        GridwiseGemm::MakeRGridDescriptor_MBlock_MPerBlock(r_grid_desc_m_);
-                });
-            }
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                p_ds_grid_(i)   = static_cast<const DDataType*>(p_ds_grid[i]);
+                stride_ds_[i]   = StrideDs[i];
+            });
+            static_for<0, NumRTensor, 1>{}([&](auto i) {
+                using RDataType = remove_cvref_t<tuple_element_t<i.value, RsDataType>>;
+                p_rs_grid_(i)   = static_cast<RDataType*>(p_rs_grid[i]);
+            });
         }
 
         //  private:
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
-        typename GridwiseGemm::RsGridPointer p_rs_grid_;
-
+        typename GridwiseGemm64::RsGridPointer p_rs_grid_;
+        index_t MRaw_;
+        index_t NRaw_;
+        std::array<index_t, NumDTensor> stride_ds_;
         // tensor descriptors
         AGridDesc_M_K a_grid_desc_m_k_;
         BGridDesc_N_K b_grid_desc_n_k_;
         EGridDesc_M_N e_grid_desc_m_n_;
         RGridDesc_M r_grid_desc_m_;
-
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        StaticallyIndexedArray<
-            typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-            NumDTensor>
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_; // FIXME: Ds desc may be of different
-                                                             // type from E
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
-
-        StaticallyIndexedArray<typename GridwiseGemm::RGridDescriptor_MBlock_MPerBlock, NumRTensor>
-            rs_grid_desc_mblock_mperblock_;
-
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
 
@@ -466,7 +443,8 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -476,6 +454,31 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            StaticallyIndexedArray<
+                typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                NumDTensor>
+                ds_grid_desc_mblock_mperblock_nblock_nperblock = {};
+
+            StaticallyIndexedArray<typename GridwiseGemm::RGridDescriptor_MBlock_MPerBlock,
+                                   NumRTensor>
+                rs_grid_desc_mblock_mperblock = {};
+
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                const auto d_grid_desc_m_n =
+                    DeviceOp::MakeEGridDescriptor_M_N(arg.MRaw_, arg.NRaw_, arg.stride_ds_[i]);
+                ds_grid_desc_mblock_mperblock_nblock_nperblock(i) =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        d_grid_desc_m_n);
+            });
+
+            static_for<0, NumRTensor, 1>{}([&](auto i) {
+                rs_grid_desc_mblock_mperblock(i) =
+                    GridwiseGemm64::MakeRGridDescriptor_MBlock_MPerBlock(arg.r_grid_desc_m_);
+            });
 
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
@@ -526,9 +529,9 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
                                               arg.rs_element_op_,
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.rs_grid_desc_mblock_mperblock_,
+                                              ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              rs_grid_desc_mblock_mperblock,
                                               arg.block_2_etile_map_);
             };
 
@@ -546,6 +549,8 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -556,16 +561,33 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.r_grid_desc_m_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.r_grid_desc_m_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.r_grid_desc_m_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..b7cc7bd7d0
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/// @brief \"Universal\" GEMM operation with SplitK support and multiple D tensors.
+///
+/// @par Overview
+///         This GEMM operation implements the following mathematical equation:
+///         E{M,N} = CDE_op(A_op(A{M,K}) * B_op(B{K,N}), Ds{M,N}...)
+///         Where A, B, Ds are input tensors and E is the output tensor. The A/B are elementwise
+//          operations that could be applied on each tensor respectively. The CDE_op is an
+//          elementwise operation applied to the C and all D tensors.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam DsLayout    D tensors data layouts.
+/// @tparam ELayout     E tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam DsDataType  D tensors data types.
+/// @tparam EDataType   E tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CDEElementwiseOperation Elementwise operation applied to the C output tensor (after
+///                                 GEMM) and D input tensors.
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1         The vector load size from global memory for A tensor.
+/// @tparam BK1         The vector load size from global memory for B tensor.
+/// @tparam MPerWmma    M size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam NPerWmma    N size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam MRepeat     The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NRepeat     The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CDEShuffleBlockTransferScalarPerVectors The size of vectorized memory access.
+///                                         Used when loading data from D tensors and storing data
+///                                         to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = EDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemmMultipleD_Wmma_CShuffleV3
+    : public DeviceGemmMultipleDSplitK<ALayout,
+                                       BLayout,
+                                       DsLayout,
+                                       ELayout,
+                                       ADataType,
+                                       BDataType,
+                                       DsDataType,
+                                       EDataType,
+                                       AElementwiseOperation,
+                                       BElementwiseOperation,
+                                       CDEElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
+        ALayout,
+        BLayout,
+        DsLayout,
+        ELayout,
+        Tuple<ADataType>,
+        Tuple<BDataType>,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerWmma,
+        NPerWmma,
+        MRepeat,
+        NRepeat,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    using DeviceGemmCommon =
+        DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
+                                          Tuple<ADataType>,
+                                          Tuple<BDataType>,
+                                          DsDataType,
+                                          EDataType,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          BlockSize,
+                                          AK1,
+                                          BK1,
+                                          GemmSpec,
+                                          CDEShuffleBlockTransferScalarPerVectors,
+                                          BlkGemmPipeSched,
+                                          BlkGemmPipelineVer,
+                                          ComputeTypeA,
+                                          ComputeTypeB>;
+
+    // Invoker
+    using Invoker = typename DeviceGemmCommon::Invoker;
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        return DeviceGemmCommon::IsSupportedArgument(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{std::array<const void*, 1>{p_a},
+                        std::array<const void*, 1>{p_b},
+                        p_ds,
+                        static_cast<EDataType*>(p_e),
+                        M,
+                        N,
+                        K,
+                        std::array<index_t, 1>{StrideA},
+                        std::array<index_t, 1>{StrideB},
+                        StrideDs,
+                        StrideE,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t StrideA,
+                        index_t StrideB,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        index_t StrideE,
+                        index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(std::array<const void*, 1>{p_a},
+                                          std::array<const void*, 1>{p_b},
+                                          p_ds,
+                                          static_cast<EDataType*>(p_e),
+                                          M,
+                                          N,
+                                          K,
+                                          std::array<index_t, 1>{StrideA},
+                                          std::array<index_t, 1>{StrideB},
+                                          StrideDs,
+                                          StrideE,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmMultipleD_Wmma_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0];
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+            str << std::string(DLayout::name)[0];
+        });
+        str << std::string(ELayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock << "x" << NPerBlock << "x" << KPerBlock << ", "
+            << "WaveTile: "
+            << MPerWmma << "x"<<NPerWmma << ", "
+            << "WaveMap: "
+            << MRepeat << "x" << NRepeat << ", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector << "x" << BBlockTransferSrcScalarPerVector << ", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "KPack: "
+            << GridwiseGemm::KPack;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index c7481997a9..c051a080ea 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -54,23 +54,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                             e_grid_desc_mblock_mperblock_nblock_nperblock,
                                         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        p_a_grid,
-        p_b_grid,
-        p_ds_grid,
-        p_e_grid,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_mblock_mperblock_nblock_nperblock,
-        e_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            p_a_grid,
+            p_b_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -161,6 +164,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                                                      CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGemmMultipleD_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor = DsDataType::Size();
 
@@ -247,7 +253,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
     using EGridDesc_M_N  = decltype(MakeEGridDescriptor_M_N<ELayout>(1, 1, 1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType,
         BDataType,
         ComputeDataType,
@@ -268,7 +275,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -291,24 +298,26 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
 #ifndef __HIPCC_RTC__
     // Argument
@@ -337,12 +346,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
               ds_grid_desc_m_n_{},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N<ELayout>(MRaw, NRaw, StrideE)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -362,22 +369,6 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                 ds_grid_desc_m_n_(i) =
                     DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaw, NRaw, StrideDs[i]);
             });
-
-            // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
-            {
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
-
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-            }
         }
 
         void Print() const
@@ -393,7 +384,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -405,9 +396,6 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
@@ -428,7 +416,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -438,6 +427,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.ds_grid_desc_m_n_);
+
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
 
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
@@ -475,8 +471,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                               arg.cde_element_op_,
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
                                               arg.block_2_etile_map_);
             };
 
@@ -492,6 +488,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -585,17 +583,38 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
 #ifndef __HIPCC_RTC__
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(!IsSupported(arg.MRaw_, arg.NRaw_, arg.KRaw_))
         {
             return false;
         }
 
-        return IsSupported(arg.MRaw_, arg.NRaw_, arg.KRaw_) and
-               GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -735,18 +754,18 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         using EGridDesc_M_N =
             remove_cvref_t<decltype(DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{}))>;
         using AGridDesc_AK0_M_AK1 =
-            remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+            remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
                 DeviceOp::matrix_padder.PadADescriptor_M_K(ADesc{})))>;
         using BGridDesc_BK0_N_BK1 =
-            remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+            remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
                 DeviceOp::matrix_padder.PadBDescriptor_N_K(BDesc{})))>;
         using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-            decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 ds_tuple()))>;
         using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-            decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
-        using Block2ETileMap = remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(
+        using Block2ETileMap = remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(
             DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
 
         // tensor descriptors for problem definiton
@@ -790,21 +809,21 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                   ds)},
               e_grid_desc_m_n{DeviceOp::matrix_padder.PadCDescriptor_M_N(e)},
               a_grid_desc_ak0_m_ak1{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k)},
               b_grid_desc_bk0_n_bk1{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock{
-                  GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                  GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                       transform_tuples(
                           [&](auto d) constexpr {
                               return DeviceOp::matrix_padder.PadCDescriptor_M_N(d);
                           },
                           ds))},
               e_grid_desc_mblock_mperblock_nblock_nperblock{
-                  GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                  GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                       e_grid_desc_m_n)},
-              block_2_etile_map{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n)},
-              has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
+              block_2_etile_map{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n)},
+              has_main_k_block_loop{GridwiseGemm64::CalculateHasMainKBlockLoop(
                   a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
               a_element_op{a_element_op_},
               b_element_op{b_element_op_},
@@ -817,12 +836,33 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
 
         constexpr bool IsValid() const
         {
-            return GridwiseGemm::CheckValidity(a_grid_desc_m_k,
-                                               b_grid_desc_n_k,
-                                               ds_grid_desc_m_n,
-                                               e_grid_desc_m_n,
-                                               block_2_etile_map) and
-                   IsSupported(MRaw, NRaw, KRaw);
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return GridwiseGemm64::CheckValidity(a_grid_desc_m_k,
+                                                         b_grid_desc_n_k,
+                                                         ds_grid_desc_m_n,
+                                                         e_grid_desc_m_n,
+                                                         block_2_etile_map) and
+                           IsSupported(MRaw, NRaw, KRaw) and
+                           GridwiseGemm64::template IsValidCompilationParameter<>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return GridwiseGemm32::CheckValidity(a_grid_desc_m_k,
+                                                         b_grid_desc_n_k,
+                                                         ds_grid_desc_m_n,
+                                                         e_grid_desc_m_n,
+                                                         block_2_etile_map) and
+                           IsSupported(MRaw, NRaw, KRaw) and
+                           GridwiseGemm32::template IsValidCompilationParameter<>();
+                }
+            }
+            return false;
         }
 
         constexpr index_t GetBlockSize() const { return BlockSize; }
@@ -854,10 +894,12 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                DsPointer p_ds_grid,
                                EDataType* __restrict__ p_e_grid)
     {
-        __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
 #ifndef __HIPCC_RTC__
         assert(desc.IsValid());
 #endif
+        using GridwiseGemm = conditional_t<get_warp_size() == 64, GridwiseGemm64, GridwiseGemm32>;
+        __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
         if(desc.has_main_k_block_loop)
         {
             GridwiseGemm::template Run<true, InMemoryDataOperationEnum::Set>(
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
index 010a23c66c..66e727faa5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -74,10 +74,14 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
                                  BElementwiseOperation,
                                  CDEElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr auto I1            = Number<1>{};
     static constexpr index_t NumDTensor = DsDataType::Size();
 
-    using GridwiseGemm = GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad<
         ALayout,
         BLayout,
         DsLayout,
@@ -104,7 +108,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
@@ -121,13 +125,17 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     struct Invoker : public BaseInvoker
     {
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -191,6 +199,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
             }
         }
 
+        INVOKER_RUN3_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -200,11 +210,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!ck::is_lds_direct_load_supported())
         {
             return false;
@@ -288,11 +297,29 @@ struct DeviceGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
             }
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
index 0796614bd4..7e9020d796 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -82,9 +82,13 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
                                                                            CElementwiseOperation>
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         DsLayout,
@@ -108,7 +112,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -126,7 +130,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
         false,
         BBlockLdsExtraN,
         CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
+        math::min(CShuffleNXdlPerWavePerShuffle, NXdlPerWave_),
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEShuffleBlockTransferScalarPerVectors,
         BlkGemmPipeSched,
@@ -135,13 +139,16 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
-
+    using Argument = typename GridwiseGemm64::Argument;
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -169,7 +176,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -188,8 +195,13 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -624,6 +636,8 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -640,11 +654,14 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -658,7 +675,22 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -781,7 +813,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
index f444399812..5ba24d04eb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -92,10 +92,14 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
                                          BElementwiseOperation,
                                          CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         DsLayout,
@@ -122,7 +126,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -140,7 +144,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
         false,
         BBlockLdsExtraN,
         CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
+        math::min(CShuffleNXdlPerWavePerShuffle, NXdlPerWave_),
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEShuffleBlockTransferScalarPerVectors,
         BlkGemmPipeSched,
@@ -149,13 +153,17 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<math::max(NXdlPerWave32, 1)>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -180,7 +188,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -192,7 +200,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
                     auto size_b_buffer =
                         b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -293,6 +301,8 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -309,7 +319,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
@@ -328,7 +338,22 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -456,7 +481,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
index 4761ee2026..832267fdd9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -82,10 +82,14 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
                                                   BElementwiseOperation,
                                                   CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle<
         ALayout,
         BLayout,
         DsLayout,
@@ -109,7 +113,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -136,15 +140,18 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
-
+    using Argument = typename GridwiseGemm64::Argument;
     int GetPreShuffleParameters() override { return NPerXDL; }
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -172,7 +179,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -191,8 +198,13 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -492,6 +504,8 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -508,11 +522,14 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -531,7 +548,22 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -652,7 +684,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
index c446ca59ea..4dcdff9153 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
@@ -93,9 +93,13 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
                                                         CElementwiseOperation>
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle<
         ALayout,
         BLayout,
         DsLayout,
@@ -122,7 +126,7 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -140,7 +144,7 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
         false,
         BBlockLdsExtraN,
         CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
+        math::min(CShuffleNXdlPerWavePerShuffle, NXdlPerWave_),
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEShuffleBlockTransferScalarPerVectors,
         BlkGemmPipeSched,
@@ -149,15 +153,19 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<math::max(NXdlPerWave32, 1)>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     int GetPreShuffleParameters() override { return NPerXDL; }
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -182,7 +190,7 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -194,7 +202,7 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
                     auto size_b_buffer =
                         b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -322,6 +330,8 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -338,16 +348,19 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // if(ScaleBlockM % MPerBlock != 0 || ScaleBlockN % NPerBlock != 0 || ScaleBlockK !=
         // KPerBlock)
         // {
         //     return false;
         // }
+        if(is_gfx11_supported() && arg.KBatch > 1)
+        {
+            return false;
+        }
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -367,7 +380,22 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -495,7 +523,7 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
index c88294edfa..14a1824508 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -78,6 +78,10 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
 {
     using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -378,7 +382,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
     using ReduceGridDesc_M    = decltype(MakeReduceGridDescriptor_M(1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -407,7 +412,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -432,6 +437,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
         CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
         CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -459,27 +466,13 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
               b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
               c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
               reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              reduce_grid_desc_mblock_mperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              block_2_ctile_map_{GridwiseGemm64::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               c_element_op_{c_element_op},
               reduce_in_element_ops_{reduce_in_element_ops},
               reduce_out_element_ops_{reduce_out_element_ops}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-
-                reduce_grid_desc_mblock_mperblock_ =
-                    GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_);
-            }
         }
 
         //  private:
@@ -491,11 +484,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         CGridDesc_M_N c_grid_desc_m_n_;
         ReduceGridDesc_M reduce_grid_desc_m_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock
-            reduce_grid_desc_mblock_mperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        typename GridwiseGemm64::DefaultBlock2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CElementwiseOperation c_element_op_;
@@ -508,7 +497,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -536,6 +526,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
+
+            auto reduce_grid_desc_mblock_mperblock =
+                GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(arg.reduce_grid_desc_m_);
 
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
@@ -563,26 +559,25 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                elapsed_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_reduces_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.reduce_in_element_ops_,
-                                           arg.reduce_out_element_ops_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
-                                           arg.block_2_ctile_map_);
+                elapsed_time = launch_and_time_kernel(stream_config,
+                                                      kernel,
+                                                      dim3(grid_size),
+                                                      dim3(BlockSize),
+                                                      0,
+                                                      arg.p_a_grid_,
+                                                      arg.p_b_grid_,
+                                                      arg.p_c_grid_,
+                                                      arg.p_reduces_grid_,
+                                                      arg.a_element_op_,
+                                                      arg.b_element_op_,
+                                                      arg.c_element_op_,
+                                                      arg.reduce_in_element_ops_,
+                                                      arg.reduce_out_element_ops_,
+                                                      arg.a_grid_desc_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bk0_n_bk1_,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      reduce_grid_desc_mblock_mperblock,
+                                                      arg.block_2_ctile_map_);
             }
             else
             {
@@ -603,31 +598,32 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
 
-                elapsed_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_reduces_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.reduce_in_element_ops_,
-                                           arg.reduce_out_element_ops_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.reduce_grid_desc_mblock_mperblock_,
-                                           arg.block_2_ctile_map_);
+                elapsed_time = launch_and_time_kernel(stream_config,
+                                                      kernel,
+                                                      dim3(grid_size),
+                                                      dim3(BlockSize),
+                                                      0,
+                                                      arg.p_a_grid_,
+                                                      arg.p_b_grid_,
+                                                      arg.p_c_grid_,
+                                                      arg.p_reduces_grid_,
+                                                      arg.a_element_op_,
+                                                      arg.b_element_op_,
+                                                      arg.c_element_op_,
+                                                      arg.reduce_in_element_ops_,
+                                                      arg.reduce_out_element_ops_,
+                                                      arg.a_grid_desc_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bk0_n_bk1_,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      reduce_grid_desc_mblock_mperblock,
+                                                      arg.block_2_ctile_map_);
             }
 
             return elapsed_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -644,15 +640,31 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperatio
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp
index a921962c67..2ceeb39bac 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp
@@ -177,15 +177,16 @@ struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                         BElementwiseOperation,
                                                         CElementwiseOperation>
 {
-    // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
         ALayout,
         BLayout,
+        Tuple<>, // DsLayout
         CLayout,
-        ADataType,
-        BDataType,
+        Tuple<ADataType>,
+        Tuple<BDataType>,
         AccDataType,
         CShuffleDataType,
+        Tuple<>, // DsDataType
         CDataType,
         AElementwiseOperation,
         BElementwiseOperation,
@@ -220,7 +221,7 @@ struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
@@ -230,21 +231,24 @@ struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
 
     using Argument = typename GridwiseGemm::Argument;
 
-    using DeviceGemmCommon = DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
-                                                               ADataType,
-                                                               BDataType,
-                                                               CDataType,
-                                                               MPerBlock,
-                                                               NPerBlock,
-                                                               KPerBlock,
-                                                               BlockSize,
-                                                               AK1,
-                                                               BK1,
-                                                               GemmSpec,
-                                                               BlkGemmPipeSched,
-                                                               BlkGemmPipelineVer,
-                                                               ComputeTypeA,
-                                                               ComputeTypeB>;
+    using DeviceGemmCommon =
+        DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
+                                          Tuple<ADataType>,
+                                          Tuple<BDataType>,
+                                          Tuple<>,
+                                          CDataType,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          BlockSize,
+                                          AK1,
+                                          BK1,
+                                          GemmSpec,
+                                          Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
+                                          BlkGemmPipeSched,
+                                          BlkGemmPipelineVer,
+                                          ComputeTypeA,
+                                          ComputeTypeB>;
 
     // Invoker
     using Invoker = typename DeviceGemmCommon::Invoker;
@@ -275,11 +279,25 @@ struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
                              index_t StrideB,
                              index_t StrideC,
                              index_t KBatch,
-                             AElementwiseOperation,
-                             BElementwiseOperation,
-                             CElementwiseOperation)
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation cde_element_op)
     {
-        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, KBatch};
+        return Argument{std::array<const void*, 1>{p_a},
+                        std::array<const void*, 1>{p_b},
+                        std::array<const void*, 0>{}, // p_ds_grid_
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        std::array<index_t, 1>{StrideA},
+                        std::array<index_t, 1>{StrideB},
+                        std::array<index_t, 0>{}, // StrideDs_
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -295,20 +313,25 @@ struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                       index_t StrideB,
                                                       index_t StrideC,
                                                       index_t KBatch,
-                                                      AElementwiseOperation,
-                                                      BElementwiseOperation,
-                                                      CElementwiseOperation) override
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
     {
-        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-                                          static_cast<const BDataType*>(p_b),
+        return std::make_unique<Argument>(std::array<const void*, 1>{p_a},
+                                          std::array<const void*, 1>{p_b},
+                                          std::array<const void*, 0>{}, // p_ds_grid_
                                           static_cast<CDataType*>(p_c),
                                           M,
                                           N,
                                           K,
-                                          StrideA,
-                                          StrideB,
+                                          std::array<index_t, 1>{StrideA},
+                                          std::array<index_t, 1>{StrideB},
+                                          std::array<index_t, 0>{}, // StrideDs_
                                           StrideC,
-                                          KBatch);
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp
index 1a68b35f1f..5e9a861f41 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp
@@ -89,11 +89,14 @@ struct DeviceGemm_BScale_Wmma_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
     using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3_b_scale<
         ALayout,
         BLayout,
+        Tuple<>, // DsLayout
         CLayout,
-        ADataType,
-        BDataType,
+        Tuple<ADataType>,
+        Tuple<BDataType>,
+        BScaleDataType,
         AccDataType,
         CShuffleDataType,
+        Tuple<>, // DsDataType
         CDataType,
         AElementwiseOperation,
         BElementwiseOperation,
@@ -130,7 +133,7 @@ struct DeviceGemm_BScale_Wmma_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
@@ -140,21 +143,24 @@ struct DeviceGemm_BScale_Wmma_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
 
     using Argument = typename GridwiseGemm::Argument;
 
-    using DeviceGemmCommon = DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
-                                                               ADataType,
-                                                               BDataType,
-                                                               CDataType,
-                                                               MPerBlock,
-                                                               NPerBlock,
-                                                               KPerBlock,
-                                                               BlockSize,
-                                                               AK1,
-                                                               BK1,
-                                                               GemmSpec,
-                                                               BlkGemmPipeSched,
-                                                               BlkGemmPipelineVer,
-                                                               ComputeTypeA,
-                                                               ComputeTypeB>;
+    using DeviceGemmCommon =
+        DeviceGemm_Wmma_CShuffleV3_Common<GridwiseGemm,
+                                          Tuple<ADataType>,
+                                          Tuple<BDataType>,
+                                          Tuple<>,
+                                          CDataType,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          BlockSize,
+                                          AK1,
+                                          BK1,
+                                          GemmSpec,
+                                          Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
+                                          BlkGemmPipeSched,
+                                          BlkGemmPipelineVer,
+                                          ComputeTypeA,
+                                          ComputeTypeB>;
 
     // Invoker
     using Invoker = typename DeviceGemmCommon::Invoker;
@@ -188,23 +194,25 @@ struct DeviceGemm_BScale_Wmma_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
                              index_t KBatch,
                              AElementwiseOperation a_element_op,
                              BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             CElementwiseOperation cde_element_op)
     {
-        return Argument{p_a,
-                        p_b,
+        return Argument{std::array<const void*, 1>{p_a},
+                        std::array<const void*, 1>{p_b},
+                        std::array<const void*, 0>{}, // p_ds_grid_
                         p_c,
                         M,
                         N,
                         K,
-                        StrideA,
-                        StrideB,
+                        std::array<index_t, 1>{StrideA},
+                        std::array<index_t, 1>{StrideB},
+                        std::array<index_t, 0>{}, // StrideDs_
                         StrideC,
                         StrideScaleB,
                         p_b_scale,
                         KBatch,
                         a_element_op,
                         b_element_op,
-                        c_element_op};
+                        cde_element_op};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -226,14 +234,16 @@ struct DeviceGemm_BScale_Wmma_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
                                                       BElementwiseOperation b_element_op,
                                                       CElementwiseOperation c_element_op) override
     {
-        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-                                          static_cast<const BDataType*>(p_b),
+        return std::make_unique<Argument>(std::array<const void*, 1>{p_a},
+                                          std::array<const void*, 1>{p_b},
+                                          std::array<const void*, 0>{}, // p_ds_grid_
                                           static_cast<CDataType*>(p_c),
                                           M,
                                           N,
                                           K,
-                                          StrideA,
-                                          StrideB,
+                                          std::array<index_t, 1>{StrideA},
+                                          std::array<index_t, 1>{StrideB},
+                                          std::array<index_t, 0>{}, // StrideDs_
                                           StrideC,
                                           StrideScaleB,
                                           static_cast<const BScaleDataType*>(p_b_scale),
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp
index 24b96a1e60..4269d67d12 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <iostream>
 #include <sstream>
 
@@ -22,9 +23,10 @@ namespace tensor_operation {
 namespace device {
 
 template <typename GridwiseGemm,
-          typename ADataType,
-          typename BDataType,
-          typename CDataType,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t KPerBlock,
@@ -32,6 +34,7 @@ template <typename GridwiseGemm,
           index_t AK1,
           index_t BK1,
           GemmSpecialization GemmSpec,
+          typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched,
           BlockGemmPipelineVersion BlkGemmPipelineVer,
           typename ComputeTypeA,
@@ -85,18 +88,42 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                 {
                     Argument arg_ = arg;
 
-                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
-                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
-                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
-                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideAs, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideBs, arg_.BK0);
 
-                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
-                                         sizeof(ADataType) / GridwiseGemm::APackedSize;
-                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
-                                         sizeof(BDataType) / GridwiseGemm::BPackedSize;
+                    std::array<std::size_t, GridwiseGemm::NumATensor> size_as_buffers;
+                    static_for<0, GridwiseGemm::NumATensor, 1>{}([&](auto i) {
+                        using ADataType    = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+                        size_as_buffers[i] = a_grid_desc_ak0_m_ak1[i].GetElementSpaceSize() *
+                                             sizeof(ADataType) / GridwiseGemm::APackedSize;
+                    });
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    std::array<std::size_t, GridwiseGemm::NumBTensor> size_bs_buffers;
+                    static_for<0, GridwiseGemm::NumBTensor, 1>{}([&](auto i) {
+                        using BDataType    = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+                        size_bs_buffers[i] = b_grid_desc_bk0_n_bk1[i].GetElementSpaceSize() *
+                                             sizeof(BDataType) / GridwiseGemm::BPackedSize;
+                    });
+
+                    const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
+                        arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
+
+                    std::array<std::size_t, GridwiseGemm::NumDTensor> size_ds_buffers;
+                    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+                        using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                        size_ds_buffers[i] =
+                            ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
+                    });
+
+                    ck::utility::
+                        RotatingMemWrapperMultiABD<Argument, AsDataType, BsDataType, DsDataType>
+                            rotating_mem(arg_,
+                                         stream_config.rotating_count,
+                                         size_as_buffers,
+                                         size_bs_buffers,
+                                         size_ds_buffers);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -106,9 +133,9 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                         rotating_mem.Next();
                         // clear c mem
                         if(arg_.KBatch > 1)
-                            HIP_CHECK_ERROR(hipMemsetAsync(arg_.p_c_grid,
+                            HIP_CHECK_ERROR(hipMemsetAsync(arg_.p_e_grid,
                                                            0,
-                                                           arg_.M * arg_.N * sizeof(CDataType),
+                                                           arg_.M * arg_.N * sizeof(EDataType),
                                                            stream_config.stream_id_));
                     };
 
@@ -124,9 +151,9 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                 else
                 {
                     if(arg.KBatch > 1)
-                        HIP_CHECK_ERROR(hipMemsetAsync(arg.p_c_grid,
+                        HIP_CHECK_ERROR(hipMemsetAsync(arg.p_e_grid,
                                                        0,
-                                                       arg.M * arg.N * sizeof(CDataType),
+                                                       arg.M * arg.N * sizeof(EDataType),
                                                        stream_config.stream_id_));
 
                     ave_time = launch_and_time_kernel(
@@ -149,6 +176,16 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                 }
             }();
 
+            // ThreadwiseTensorSliceTransfer_v7r3 (used in ThreadGroupTensorSliceTransfer_v7r3) is
+            // currently implemented in such a way that all SrcScalarPerVectors must be the same, so
+            // if one of D matrices is column-major, then all SrcScalarPerVectors must be 1. On the
+            // other hand, Split K for 16-bit outputs uses packed atomics so ScalarPerVectors cannot
+            // be odd.
+            constexpr bool AtomicsImplementationExists =
+                !(std::is_same_v<EDataType, ck::half_t> || std::is_same_v<EDataType, ck::bhalf_t> ||
+                  std::is_same_v<EDataType, int8_t>) ||
+                (CDEShuffleBlockTransferScalarPerVectors{}[0] % 2 == 0);
+
             if(has_main_k_block_loop)
             {
                 // Tail number always full
@@ -157,12 +194,15 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                 {
                     if(arg.KBatch > 1)
                     {
-                        const auto kernel =
-                            kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
-                                                         true,
-                                                         InMemoryDataOperationEnum::AtomicAdd,
-                                                         minimum_occupancy>;
-                        Run(kernel);
+                        if constexpr(AtomicsImplementationExists)
+                        {
+                            const auto kernel =
+                                kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             minimum_occupancy>;
+                            Run(kernel);
+                        }
                     }
                     else
                     {
@@ -186,12 +226,15 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
                 {
                     if(arg.KBatch > 1)
                     {
-                        const auto kernel =
-                            kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
-                                                         false,
-                                                         InMemoryDataOperationEnum::AtomicAdd,
-                                                         minimum_occupancy>;
-                        Run(kernel);
+                        if constexpr(AtomicsImplementationExists)
+                        {
+                            const auto kernel =
+                                kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             minimum_occupancy>;
+                            Run(kernel);
+                        }
                     }
                     else
                     {
@@ -229,8 +272,8 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
             return false;
         }
 
-        if constexpr(std::is_same_v<CDataType, ck::half_t> ||
-                     std::is_same_v<CDataType, ck::bhalf_t>)
+        if constexpr(std::is_same_v<EDataType, ck::half_t> ||
+                     std::is_same_v<EDataType, ck::bhalf_t>)
         {
             if(arg.KBatch > 1 && ck::is_gfx11_supported())
             {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp
new file mode 100644
index 0000000000..df51a2aa27
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <sstream>
+#include <type_traits>
+#include <typeinfo>
+#include <memory>
+#include <array>
+#include <stdexcept>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ReduceDataType                     = CDataType,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA>
+struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
+                                                            BLayout,
+                                                            DsLayout,
+                                                            CLayout,
+                                                            ADataType,
+                                                            BDataType,
+                                                            DsDataType,
+                                                            CDataType,
+                                                            AElementwiseOperation,
+                                                            BElementwiseOperation,
+                                                            CElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
+        ALayout,
+        BLayout,
+        Tuple<>,
+        CLayout,
+        Tuple<ADataType>,
+        Tuple<BDataType>,
+        GemmAccDataType,
+        ReduceDataType,
+        Tuple<>,
+        ReduceDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        PassThrough,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerWmma,
+        NPerWmma,
+        MRepeat,
+        NRepeat,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        Sequence<CShuffleBlockTransferScalarPerVector_NPerBlock>,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        false,
+        false>;
+
+    struct Argument : public GridwiseGemm::Argument
+    {
+        Argument(std::array<const void*, 1> p_a_grid_,
+                 std::array<const void*, 1> p_b_grid_,
+                 const ::std::array<const void*, NumDTensor> p_ds_,
+                 CDataType* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 std::array<index_t, 1> StrideA_,
+                 std::array<index_t, 1> StrideB_,
+                 const ::std::array<index_t, NumDTensor> stride_ds_,
+                 index_t StrideC_,
+                 index_t KBatch_,
+                 AElementwiseOperation a_element_op_,
+                 BElementwiseOperation b_element_op_,
+                 CElementwiseOperation c_element_op_)
+            : GridwiseGemm::Argument(p_a_grid_,
+                                     p_b_grid_,
+                                     ::std::array<const void*, 0>{},
+                                     reinterpret_cast<ReduceDataType*>(p_c_grid_),
+                                     M_,
+                                     N_,
+                                     K_,
+                                     StrideA_,
+                                     StrideB_,
+                                     std::array<index_t, 0>{},
+                                     StrideC_,
+                                     KBatch_,
+                                     a_element_op_,
+                                     b_element_op_,
+                                     PassThrough{},
+                                     true),
+              p_c_grid(p_c_grid_),
+              c_element_op(c_element_op_),
+              p_ds(p_ds_),
+              StrideDs(stride_ds_)
+        {
+        }
+
+        CDataType* p_c_grid;
+        CElementwiseOperation c_element_op;
+        const ::std::array<const void*, NumDTensor> p_ds;
+        ::std::array<index_t, NumDTensor> StrideDs;
+    };
+
+    using ReduceAdd               = ck::reduce::Add;
+    using OutElementwiseOperation = CElementwiseOperation;
+
+    static constexpr auto DsVectorLengthSequence = generate_sequence_v2(
+        [](auto i) {
+            using DLayout = ::std::__remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+            if constexpr(is_same<CLayout, DLayout>::value)
+                return Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{};
+            else
+                return Number<1>{};
+        },
+        Number<NumDTensor>{});
+
+    using DeviceReduceInstance = DeviceReduceThreadWiseMultiD<
+        ReduceDataType,  // InDataType
+        DsDataType,      // DsDatatype
+        GemmAccDataType, // AccDataType
+        CDataType,       // OutDataType
+        3,               // Rank
+        1,               // NumReduceDim
+        ReduceAdd,
+        PassThrough,
+        OutElementwiseOperation,
+        256,                                            // BlockSize_
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // MThreadSliceSize_
+        1,                                              // KThreadSliceSize_
+        0,                                              // InSrcVectorDim_
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // InSrcVectorSize_
+        CShuffleBlockTransferScalarPerVector_NPerBlock, // OutDstVectorSize_
+        decltype(DsVectorLengthSequence)>;
+
+    struct Invoker : public BaseInvoker
+    {
+        float RunReduce(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            static constexpr index_t NumInDim  = 3;
+            static constexpr index_t NumOutDim = 2;
+
+            ::std::array<index_t, NumInDim> in_lengths   = {arg.KBatch, arg.M, arg.N};
+            ::std::array<index_t, NumOutDim> out_lengths = {arg.M, arg.N};
+
+            ::std::array<index_t, NumInDim> in_strides;
+            ::std::array<index_t, NumOutDim> out_strides;
+            if constexpr(is_same<CLayout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                in_strides  = {arg.M * arg.N, arg.N, 1};
+                out_strides = {arg.N, 1};
+            }
+            else
+            {
+                in_strides  = {arg.M * arg.N, 1, arg.M};
+                out_strides = {1, arg.M};
+            }
+
+            ::std::array<int, 1> reduce_dims{0};
+
+            ::std::array<::std::array<index_t, NumOutDim>, NumDTensor> DsLengths;
+            ::std::array<::std::array<index_t, NumOutDim>, NumDTensor> DsStrides;
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                DsLengths[i] = out_lengths;
+
+                using DLayout = ::std::__remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                if constexpr(is_same<DLayout, ck::tensor_layout::gemm::RowMajor>::value)
+                {
+                    DsStrides[i] = {arg.StrideDs[i], 1};
+                }
+                else
+                {
+                    DsStrides[i] = {1, arg.StrideDs[i]};
+                }
+            });
+
+            auto reduce = DeviceReduceInstance{};
+
+            auto argument_ptr = reduce.MakeArgumentPointer(in_lengths,
+                                                           in_strides,
+                                                           DsLengths,
+                                                           DsStrides,
+                                                           out_lengths,
+                                                           out_strides,
+                                                           reduce_dims,
+                                                           arg.p_workspace_,
+                                                           arg.p_ds,
+                                                           arg.p_c_grid,
+                                                           PassThrough{},
+                                                           OutElementwiseOperation{});
+
+            auto invoker_ptr = reduce.MakeInvokerPointer();
+
+            float ave_time = 0;
+
+            if(reduce.IsSupportedArgument(argument_ptr.get()))
+            {
+                ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config);
+            }
+            else
+            {
+                throw ::std::runtime_error(
+                    "The runtime parameters are not supported by the device instance.");
+            }
+
+            return ave_time;
+        }
+
+        float Run(const Argument& arg_, const StreamConfig& stream_config = StreamConfig{})
+        {
+            auto arg = *dynamic_cast<const typename GridwiseGemm::Argument*>(&arg_);
+
+            // workspace required when doing two-kernel reduce or Ds present
+            const bool need_workspace = !(!(arg.IsReduceAdd() || NumDTensor > 0) &&
+                                          is_same<CDataType, ReduceDataType>::value);
+            if(need_workspace)
+            {
+                if(arg.p_workspace_ == nullptr)
+                {
+                    throw ::std::runtime_error("using reduce, but empty workspace!");
+                }
+                arg.p_e_grid = reinterpret_cast<ReduceDataType*>(arg.p_workspace_);
+            }
+
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw ::std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            ::std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+
+            if(has_main_k_block_loop)
+            {
+                const auto kernel =
+                    ::ck::kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       minimum_occupancy>;
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, ::dim3(gdx, gdy, gdz), ::dim3(BlockSize), 0, arg);
+            }
+            else
+            {
+                const auto kernel =
+                    ::ck::kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                       false,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       minimum_occupancy>;
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, ::dim3(gdx, gdy, gdz), ::dim3(BlockSize), 0, arg);
+            }
+
+            if(need_workspace)
+            {
+                ave_time += RunReduce(arg_, stream_config);
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_wmma_supported())
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(
+            *dynamic_cast<const typename GridwiseGemm::Argument*>(&arg));
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return GridwiseGemm::CalculateGridSize(M, N, KBatch);
+    }
+
+    static constexpr index_t GetBlockSize() { return BlockSize; }
+
+    static size_t GetSharedMemoryNumberOfByte()
+    {
+        return GridwiseGemm::GetSharedMemoryNumberOfByte();
+    }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             const ::std::array<const void*, NumDTensor> p_ds,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             const ::std::array<index_t, NumDTensor> stride_ds,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{std::array<const void*, 1>{p_a},
+                        std::array<const void*, 1>{p_b},
+                        p_ds,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        std::array<index_t, 1>{StrideA},
+                        std::array<index_t, 1>{StrideB},
+                        stride_ds,
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    ::std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return ::std::make_unique<Invoker>(Invoker{});
+    }
+
+    // Polymorphic interfaces
+    ::std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                        const void* p_b,
+                                                        ::std::array<const void*, NumDTensor> p_ds,
+                                                        void* p_c,
+                                                        index_t M,
+                                                        index_t N,
+                                                        index_t K,
+                                                        index_t StrideA,
+                                                        index_t StrideB,
+                                                        ::std::array<index_t, NumDTensor> DsStrides,
+                                                        index_t StrideC,
+                                                        index_t KSplit,
+                                                        AElementwiseOperation a_element_op,
+                                                        BElementwiseOperation b_element_op,
+                                                        CElementwiseOperation c_element_op) override
+    {
+        return ::std::make_unique<Argument>(std::array<const void*, 1>{p_a},
+                                            std::array<const void*, 1>{p_b},
+                                            p_ds,
+                                            static_cast<CDataType*>(p_c),
+                                            M,
+                                            N,
+                                            K,
+                                            std::array<index_t, 1>{StrideA},
+                                            std::array<index_t, 1>{StrideB},
+                                            DsStrides,
+                                            StrideC,
+                                            KSplit,
+                                            a_element_op,
+                                            b_element_op,
+                                            c_element_op);
+    }
+
+    ::std::string GetTypeString() const override
+    {
+        auto str = ::std::stringstream();
+
+        auto BlkGemmPipelineSchedulerToString = [](BlockGemmPipelineScheduler s) {
+            switch(s)
+            {
+            case BlockGemmPipelineScheduler::Intrawave: return ::std::string("Intrawave");
+            case BlockGemmPipelineScheduler::Interwave: return ::std::string("Interwave");
+            }
+            return ::std::string("?");
+        };
+
+        auto BlkGemmPipelineVersionToString = [](BlockGemmPipelineVersion v) {
+            switch(v)
+            {
+            case BlockGemmPipelineVersion::v1: return ::std::string("v1");
+            case BlockGemmPipelineVersion::v2: return ::std::string("v2");
+            case BlockGemmPipelineVersion::v3: return ::std::string("v3");
+            case BlockGemmPipelineVersion::v4: return ::std::string("v4");
+            case BlockGemmPipelineVersion::v5: return ::std::string("v5");
+            }
+            return ::std::string("v?");
+        };
+
+        // clang-format off
+        str << "DeviceGemmWmmaUniversalReduce"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << ::std::string(ALayout::name)[0]
+            << ::std::string(BLayout::name)[0]
+            << ::std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WmmaTile: "
+            << MPerWmma<<"x"<<NPerWmma << ", "
+            << "WmmaRepeat: "
+            << MRepeat<<"x" << NRepeat<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString(BlkGemmPipeSched) << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString(BlkGemmPipelineVer) << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        auto arg = *dynamic_cast<const Argument*>(p_arg);
+
+        // Need workspace if using split-K or have D tensors
+        if(!(!(arg.IsReduceAdd() || NumDTensor > 0) && is_same<CDataType, ReduceDataType>::value))
+        {
+            return arg.M * arg.N * arg.KBatch * sizeof(ReduceDataType);
+        }
+
+        return 0;
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
index 5188ece333..2e4abe9819 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -69,6 +69,10 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                          BElementwiseOperation,
                                          CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -76,7 +80,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
     static constexpr auto K1Number = Number<K1>{};
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -96,7 +101,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -119,13 +124,17 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         NumPrefetch,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& karg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -160,6 +169,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -201,8 +212,22 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(karg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(karg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(karg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index dd41f4bca0..327b9523b8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -80,12 +80,17 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
 {
     using DeviceOp = DeviceGemm_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ALayout,
         BLayout,
         CLayout,
@@ -109,7 +114,7 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -134,13 +139,17 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         PipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -177,6 +186,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -193,11 +204,10 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
                                                        GemmSpec == GemmSpecialization::MNKPadding ||
@@ -206,7 +216,22 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp
index ac2e826725..23b0faec67 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -69,9 +69,14 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
                                                                  BElementwiseOperation,
                                                                  CDEElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I1 = Number<1>{};
 
-    using GridwiseGemm = GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad<
         ALayout,
         BLayout,
         ck::Tuple<>,
@@ -98,7 +103,7 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
@@ -114,13 +119,19 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
-        PipelineVer>;
+        PipelineVer,
+        ComputeDataType>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
+
+    using Argument = typename GridwiseGemm64::Argument;
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -185,6 +196,8 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
             }
         }
 
+        INVOKER_RUN3_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -194,14 +207,21 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(!ck::is_lds_direct_load_supported())
         {
             return false;
         }
 
-        if(!ck::is_lds_direct_load_supported())
+        if constexpr(is_same_v<ComputeDataType, ck::tf32_t>)
         {
-            return false;
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
         }
 
         // Check vector load/store.
@@ -264,11 +284,29 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
             }
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.ds_grid_desc_m_n_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
index 3171208830..c42228369f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -75,8 +75,13 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                                                                          BElementwiseOperation,
                                                                          CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_streamk_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_streamk_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -98,7 +103,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -123,13 +128,18 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
+    //
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
 
             if(stream_config.log_level_ > 0)
@@ -176,8 +186,8 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
 
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    auto arg_ = arg;
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_,
                         stream_config.rotating_count,
                         arg_.M * arg_.K * sizeof(ADataType),
@@ -426,6 +436,8 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -464,7 +476,12 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        // gfx11 doesn't support float atomic
+        if(ck::is_gfx11_supported())
+        {
+            return false;
+        }
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
@@ -481,7 +498,22 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -489,30 +521,30 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
     {
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }
-
+    template <typename GridwiseGemm, bool IsValid>
     static auto
-    MakeArgument(const ADataType* p_a,
-                 const BDataType* p_b,
-                 CDataType* p_c,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t streamk_sel,
-                 index_t Grid_size,
-                 AElementwiseOperation,
-                 BElementwiseOperation,
-                 CElementwiseOperation,
-                 StreamKReductionStrategy reduction_strategy = StreamKReductionStrategy::Atomic)
+    MakeArgumentImp(const ADataType* p_a,
+                    const BDataType* p_b,
+                    CDataType* p_c,
+                    index_t M,
+                    index_t N,
+                    index_t K,
+                    index_t StrideA,
+                    index_t StrideB,
+                    index_t StrideC,
+                    index_t streamk_sel,
+                    index_t Grid_size,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CElementwiseOperation,
+                    StreamKReductionStrategy reduction_strategy = StreamKReductionStrategy::Atomic)
     {
 
         constexpr index_t minimum_occupancy =
             BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
-        index_t K_split                  = (K + KPerBlock - 1) / KPerBlock * KPerBlock;
-        const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
-        int occupancy, num_cu;
+        index_t K_split = (K + KPerBlock - 1) / KPerBlock * KPerBlock;
+
+        int occupancy = 1, num_cu = 1;
         const auto calculate_grid_size = [&](const auto& kernel) {
             hip_check_error(
                 hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
@@ -524,185 +556,193 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
             Grid_size = num_cu * occupancy;
         };
 
-        if(has_main_k_block_loop)
+        if constexpr(IsValid)
         {
-            // Tail number always full
-            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
-                         BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+            if(has_main_k_block_loop)
             {
-
-                const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy>;
-                calculate_grid_size(kernel);
-            }
-            // Tail number could be One to Seven
-            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-            {
-
-                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                 {
+
                     const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
                                                                     true,
                                                                     InMemoryDataOperationEnum::Set,
-                                                                    minimum_occupancy,
-                                                                    TailNumber::One>;
+                                                                    minimum_occupancy>;
                     calculate_grid_size(kernel);
                 }
-                else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
                 {
-                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                    true,
-                                                                    InMemoryDataOperationEnum::Set,
-                                                                    minimum_occupancy,
-                                                                    TailNumber::Full>;
-                    calculate_grid_size(kernel);
-                }
 
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
                     {
                         const auto kernel =
                             kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
                                                         true,
                                                         InMemoryDataOperationEnum::Set,
                                                         minimum_occupancy,
-                                                        TailNumber::Two>;
+                                                        TailNumber::One>;
                         calculate_grid_size(kernel);
                     }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three)
+                    else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
                     {
                         const auto kernel =
                             kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
                                                         true,
                                                         InMemoryDataOperationEnum::Set,
                                                         minimum_occupancy,
-                                                        TailNumber::Three>;
+                                                        TailNumber::Full>;
                         calculate_grid_size(kernel);
                     }
-                }
 
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Two>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Three>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Four>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Five>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Six>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Seven>;
+                            calculate_grid_size(kernel);
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
                 {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four)
+
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
                     {
                         const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy,
-                                                        TailNumber::Four>;
+                            kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             minimum_occupancy,
+                                                             TailNumber::Odd>;
                         calculate_grid_size(kernel);
                     }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five)
+                    else
                     {
                         const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy,
-                                                        TailNumber::Five>;
+                            kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             minimum_occupancy,
+                                                             TailNumber::Even>;
                         calculate_grid_size(kernel);
                     }
                 }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
-                    {
-                        const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy,
-                                                        TailNumber::Six>;
-                        calculate_grid_size(kernel);
-                    }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven)
-                    {
-                        const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy,
-                                                        TailNumber::Seven>;
-                        calculate_grid_size(kernel);
-                    }
-                }
-            }
-            // Tail number could be Odd or Even
-            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
-            {
-
-                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
-                {
-                    const auto kernel =
-                        kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
-                                                         true,
-                                                         InMemoryDataOperationEnum::Set,
-                                                         minimum_occupancy,
-                                                         TailNumber::Odd>;
-                    calculate_grid_size(kernel);
-                }
                 else
                 {
-                    const auto kernel =
-                        kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
-                                                         true,
-                                                         InMemoryDataOperationEnum::Set,
-                                                         minimum_occupancy,
-                                                         TailNumber::Even>;
-                    calculate_grid_size(kernel);
+
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                        calculate_grid_size(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Even>;
+                        calculate_grid_size(kernel);
+                    }
                 }
             }
             else
             {
-
-                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
                 {
+
                     const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                    true,
+                                                                    false,
                                                                     InMemoryDataOperationEnum::Set,
-                                                                    minimum_occupancy,
-                                                                    TailNumber::Odd>;
+                                                                    minimum_occupancy>;
                     calculate_grid_size(kernel);
                 }
-                else
-                {
-                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                    true,
-                                                                    InMemoryDataOperationEnum::Set,
-                                                                    minimum_occupancy,
-                                                                    TailNumber::Even>;
-                    calculate_grid_size(kernel);
-                }
-            }
-        }
-        else
-        {
-            // Tail number always 1
-            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
-            {
-
-                const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                false,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy>;
-                calculate_grid_size(kernel);
             }
         }
 
@@ -720,6 +760,62 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                         reduction_strategy};
     }
 
+    static auto
+    MakeArgument(const ADataType* p_a,
+                 const BDataType* p_b,
+                 CDataType* p_c,
+                 index_t M,
+                 index_t N,
+                 index_t K,
+                 index_t StrideA,
+                 index_t StrideB,
+                 index_t StrideC,
+                 index_t streamk_sel,
+                 index_t Grid_size,
+                 AElementwiseOperation a_op,
+                 BElementwiseOperation b_op,
+                 CElementwiseOperation c_op,
+                 StreamKReductionStrategy reduction_strategy = StreamKReductionStrategy::Atomic)
+    {
+        if(get_warp_size() == 64)
+        {
+            constexpr bool IsValid = NXdlPerWave64 > 0;
+            return MakeArgumentImp<GridwiseGemm64, IsValid>(p_a,
+                                                            p_b,
+                                                            p_c,
+                                                            M,
+                                                            N,
+                                                            K,
+                                                            StrideA,
+                                                            StrideB,
+                                                            StrideC,
+                                                            streamk_sel,
+                                                            Grid_size,
+                                                            a_op,
+                                                            b_op,
+                                                            c_op,
+                                                            reduction_strategy);
+        }
+        else
+        {
+            constexpr bool IsValid = NXdlPerWave32 > 0;
+            return MakeArgumentImp<GridwiseGemm32, IsValid>(p_a,
+                                                            p_b,
+                                                            p_c,
+                                                            M,
+                                                            N,
+                                                            K,
+                                                            StrideA,
+                                                            StrideB,
+                                                            StrideC,
+                                                            streamk_sel,
+                                                            Grid_size,
+                                                            a_op,
+                                                            b_op,
+                                                            c_op,
+                                                            reduction_strategy);
+        }
+    }
     static auto MakeInvoker() { return Invoker{}; }
 
     // polymorphic
@@ -799,7 +895,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp
index d49c63f147..43b523997d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -79,13 +79,17 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
                                                      CElementwiseOperation>
 {
     using DeviceOp = DeviceGemm_Xdl_CShuffleV2;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v2<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v2<
         ALayout,
         BLayout,
         CLayout,
@@ -109,7 +113,7 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -134,13 +138,17 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
         PipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -174,6 +182,8 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -190,11 +200,10 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
                                                        GemmSpec == GemmSpecialization::MNKPadding ||
@@ -203,7 +212,22 @@ struct DeviceGemm_Xdl_CShuffleV2 : public DeviceGemm<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
index 1cb82d24eb..d100d96583 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -176,31 +176,8 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                        BElementwiseOperation,
                                                        CElementwiseOperation>
 {
-    template <bool isWave64>
-    static constexpr auto GetNXdlPerWave()
-    {
-        constexpr index_t Waves  = isWave64 ? BlockSize / 64 : BlockSize / 32;
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXDL);
-        static_assert(MWaves > 0);
-
-        constexpr index_t NWaves = Waves / MWaves;
-        if constexpr(NWaves == 0)
-        {
-            return 0;
-        }
-        else
-        {
-            if constexpr(NPerBlock % (NPerXDL * NWaves) == 0)
-            {
-                return NPerBlock / (NWaves * NPerXDL);
-            }
-            else
-            {
-                return 0;
-            }
-        }
-    }
     // GridwiseGemm
+    GET_NXDL_PER_WAVE_IMPL
     static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
     static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
@@ -284,6 +261,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
     ///
     struct Invoker : public BaseInvoker
     {
+        /// @brief  This function issues GPU kernel execution.
+        /// @param arg           The GPU kernel arguments.
+        /// @param stream_config The HIP stream configuration helper structure.
+        /// @return              The kernel's average execution time (if time measurement is
+        ///                      enabled).
         template <typename GridwiseGemm>
         float RunImp(const typename GridwiseGemm::Argument& arg,
                      const StreamConfig& stream_config = StreamConfig{})
@@ -760,31 +742,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             return ave_time;
         }
 
-        /// @brief  This function issues GPU kernel execution.
-        /// @param arg           The GPU kernel arguments.
-        /// @param stream_config The HIP stream configuration helper structure.
-        /// @return              The kernel's average execution time (if time measurement is
-        ///                      enabled).
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            if(get_warp_size() == 64)
-            {
-                if constexpr(NXdlPerWave64 > 0)
-                {
-                    return RunImp<GridwiseGemm64>(arg, stream_config);
-                }
-            }
-            else
-            {
-                if constexpr(NXdlPerWave32 > 0)
-                {
-                    return RunImp<GridwiseGemm32>(
-                        reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg),
-                        stream_config);
-                }
-            }
-            return 0;
-        }
+        INVOKER_RUN3_IMPL
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -801,11 +759,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(arg.KBatch > 1)
         {
             if(is_gfx11_supported())
@@ -824,14 +781,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             }
         }
 
-        if(is_gfx11_supported() || is_gfx12_supported())
-        {
-            if(MPerXDL != 16 || NPerXDL != 16)
-            {
-                return false;
-            }
-        }
-
         if(is_gfx11_supported())
         {
             if constexpr(std::is_same_v<ADataType, ck::f8_t> ||
@@ -855,10 +804,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             {
                 return GridwiseGemm64::CheckValidity(arg);
             }
-            else
-            {
-                return false;
-            }
         }
         else
         {
@@ -867,11 +812,8 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                 return GridwiseGemm32::CheckValidity(
                     reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
             }
-            else
-            {
-                return false;
-            }
         }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
index faa235be50..ebd168a7d0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -77,8 +77,13 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
                                                                               BElementwiseOperation,
                                                                               CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3_b_preshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3_b_preshuffle<
         ALayout,
         BLayout,
         CLayout,
@@ -100,7 +105,7 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -127,8 +132,10 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
         ComputeTypeB,
         PermuteA,
         PermuteB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -149,7 +156,9 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -175,7 +184,7 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -187,7 +196,7 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
                     auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
                                          sizeof(BDataType) / BPackedSize;
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -377,6 +386,8 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -393,11 +404,14 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -411,7 +425,22 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -516,9 +545,9 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages << ", "
             << "Kpack: "
-            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride;
+            << GridwiseGemm64::BlockwiseGemmPipe::AMmaKStride;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
index 456e5e90d1..6e213e60ee 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -84,8 +84,13 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
                                                              BElementwiseOperation,
                                                              CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -109,7 +114,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -136,8 +141,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
         ComputeTypeB,
         PermuteA,
         PermuteB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -156,7 +163,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -181,7 +190,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -193,7 +202,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
                     auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
                                          sizeof(BDataType) / BPackedSize;
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -622,6 +631,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -638,7 +648,12 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+
+        if(is_gfx11_supported() && arg.KBatch > 1)
         {
             return false;
         }
@@ -655,8 +670,23 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
         {
             return false;
         }
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
 
-        return GridwiseGemm::CheckValidity(arg);
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -783,7 +813,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
index ae9b75cb0d..f18991e77c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
@@ -164,116 +164,131 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
                                                          BElementwiseOperation,
                                                          CElementwiseOperation>
 {
-    // GridwiseGemm
-    using GridwiseGemm = conditional_t< //
-        !is_same_v<BLayout, tensor_layout::gemm::MFMA>,
-        GridwiseGemmMX_xdl_cshuffle_v3<
-            ALayout,
-            BLayout,
-            CLayout,
-            ADataType,
-            AScaleDataType,
-            BDataType,
-            BScaleDataType,
-            GemmAccDataType,
-            CShuffleDataType,
-            CDataType,
-            AElementwiseOperation,
-            BElementwiseOperation,
-            CElementwiseOperation,
-            GemmSpec,
-            ScaleBlockSize,
-            BlockSize,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            AK1,
-            BK1,
-            MPerXDL,
-            NPerXDL,
-            MXdlPerWave,
-            NXdlPerWave,
-            ABlockTransferThreadClusterLengths_AK0_M_AK1,
-            ABlockTransferThreadClusterArrangeOrder,
-            ABlockTransferSrcAccessOrder,
-            ABlockTransferSrcVectorDim,
-            ABlockTransferSrcScalarPerVector,
-            ABlockTransferDstScalarPerVector_AK1,
-            false,
-            ABlockLdsExtraM,
-            BBlockTransferThreadClusterLengths_BK0_N_BK1,
-            BBlockTransferThreadClusterArrangeOrder,
-            BBlockTransferSrcAccessOrder,
-            BBlockTransferSrcVectorDim,
-            BBlockTransferSrcScalarPerVector,
-            BBlockTransferDstScalarPerVector_BK1,
-            false,
-            BBlockLdsExtraN,
-            CShuffleMXdlPerWavePerShuffle,
-            CShuffleNXdlPerWavePerShuffle,
-            CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-            CShuffleBlockTransferScalarPerVector_NPerBlock,
-            BlkGemmPipeSched,
-            BlkGemmPipelineVer,
-            ComputeTypeA,
-            ComputeTypeB>,
-        GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle<
-            ALayout,
-            BLayout,
-            CLayout,
-            ADataType,
-            AScaleDataType,
-            BDataType,
-            BScaleDataType,
-            GemmAccDataType,
-            CShuffleDataType,
-            CDataType,
-            AElementwiseOperation,
-            BElementwiseOperation,
-            CElementwiseOperation,
-            GemmSpec,
-            ScaleBlockSize,
-            BlockSize,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            AK1,
-            BK1,
-            MPerXDL,
-            NPerXDL,
-            MXdlPerWave,
-            NXdlPerWave,
-            ABlockTransferThreadClusterLengths_AK0_M_AK1,
-            ABlockTransferThreadClusterArrangeOrder,
-            ABlockTransferSrcAccessOrder,
-            ABlockTransferSrcVectorDim,
-            ABlockTransferSrcScalarPerVector,
-            ABlockTransferDstScalarPerVector_AK1,
-            false,
-            ABlockLdsExtraM,
-            BBlockTransferThreadClusterLengths_BK0_N_BK1,
-            BBlockTransferThreadClusterArrangeOrder,
-            BBlockTransferSrcAccessOrder,
-            BBlockTransferSrcVectorDim,
-            BBlockTransferSrcScalarPerVector,
-            BBlockTransferDstScalarPerVector_BK1,
-            false,
-            BBlockLdsExtraN,
-            CShuffleMXdlPerWavePerShuffle,
-            CShuffleNXdlPerWavePerShuffle,
-            CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-            CShuffleBlockTransferScalarPerVector_NPerBlock,
-            BlkGemmPipeSched,
-            BlkGemmPipelineVer,
-            ComputeTypeA,
-            ComputeTypeB>>;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
-    using Argument = typename GridwiseGemm::Argument;
+    // GridwiseGemm
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmMXBase = GridwiseGemmMX_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        AScaleDataType,
+        BDataType,
+        BScaleDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        ScaleBlockSize,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave_,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmMXBPreshuffleBase = GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        AScaleDataType,
+        BDataType,
+        BScaleDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        ScaleBlockSize,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave_,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB>;
+
+    using GridwiseGemm64 = conditional_t< //
+        !is_same_v<BLayout, tensor_layout::gemm::MFMA>,
+        GridwiseGemmMXBase<math::max(NXdlPerWave64, 1)>,
+        GridwiseGemmMXBPreshuffleBase<math::max(NXdlPerWave64, 1)>>;
+    using GridwiseGemm32 = conditional_t< //
+        !is_same_v<BLayout, tensor_layout::gemm::MFMA>,
+        GridwiseGemmMXBase<NXdlPerWave32>,
+        GridwiseGemmMXBPreshuffleBase<NXdlPerWave32>>;
+
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -299,7 +314,7 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -311,7 +326,7 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
                     auto size_b_buffer =
                         b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -401,6 +416,7 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -447,7 +463,22 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -578,9 +609,9 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages << ", "
             << "Kpack: "
-            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride << ", "
+            << GridwiseGemm64::BlockwiseGemmPipe::AMmaKStride << ", "
             << "ScaleBlockSize: "
             << ScaleBlockSize;
         // clang-format on
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp
index 04a7d706c9..d3a416d200 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -85,12 +85,17 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
                                                            BElementwiseOperation,
                                                            CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -112,7 +117,7 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -137,8 +142,10 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    struct Argument : public GridwiseGemm::Argument
+    struct Argument : public GridwiseGemm64::Argument
     {
         Argument(const ADataType* p_a_grid_,
                  const BDataType* p_b_grid_,
@@ -152,17 +159,17 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
                  std::array<ck::index_t, NumDTensor> StrideDs_,
                  index_t StrideC_,
                  index_t k_batch_)
-            : GridwiseGemm::Argument(p_a_grid_,
-                                     p_b_grid_,
-                                     reinterpret_cast<ReduceDataType*>(p_c_grid_),
-                                     M_,
-                                     N_,
-                                     K_,
-                                     StrideA_,
-                                     StrideB_,
-                                     StrideC_,
-                                     k_batch_,
-                                     true),
+            : GridwiseGemm64::Argument(p_a_grid_,
+                                       p_b_grid_,
+                                       reinterpret_cast<ReduceDataType*>(p_c_grid_),
+                                       M_,
+                                       N_,
+                                       K_,
+                                       StrideA_,
+                                       StrideB_,
+                                       StrideC_,
+                                       k_batch_,
+                                       true),
               p_ds(p_ds_),
               StrideDs(StrideDs_)
         {
@@ -278,9 +285,10 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
             return ave_time;
         }
 
-        float Run(const Argument& arg_, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg_, const StreamConfig& stream_config = StreamConfig{})
         {
-            auto arg = *dynamic_cast<const typename GridwiseGemm::Argument*>(&arg_);
+            auto arg = *reinterpret_cast<const typename GridwiseGemm::Argument*>(&arg_);
 
             if(!(!(arg.IsReduceAdd() || NumDTensor > 0) &&
                  std::is_same<CDataType, ReduceDataType>::value))
@@ -542,6 +550,8 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -558,11 +568,10 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
                                                        GemmSpec == GemmSpecialization::MNKPadding ||
@@ -571,7 +580,22 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -677,7 +701,7 @@ struct DeviceGemm_Xdl_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index 213501468a..d8b31695f6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -81,6 +81,10 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
 {
     using DeviceOp = DeviceGemmLayerNorm_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -380,7 +384,8 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
     using C0GridDesc_N        = decltype(MakeGridDescriptor_N(1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -406,7 +411,7 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -424,14 +429,16 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
         false,
         BBlockLdsExtraN,
         CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
+        NXdlPerWave_,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         CReduceThreadClusterLengths_MPerBlock_NPerBlock,
         CReduceThreadCopySrcDstScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
+    using Block2CTileMap = typename GridwiseGemm64::DefaultBlock2CTileMap;
 
     // Argument
     struct Argument : public BaseArgument
@@ -464,26 +471,12 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
               b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
               c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
               c0_grid_desc_n_{MakeGridDescriptor_N(NRaw)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              c0_grid_desc_nblock_nperblock_{},
               block_2_ctile_map_{Block2CTileMap(c_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               acc_element_op_{acc_element_op},
               c_element_op_{c_element_op}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-
-                c0_grid_desc_nblock_nperblock_ =
-                    GridwiseGemm::MakeC0GridDescriptor_NBlock_NPerBlock(c0_grid_desc_n_);
-            }
         }
 
         //  private:
@@ -498,9 +491,6 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         CGridDesc_M_N c_grid_desc_m_n_;
         C0GridDesc_N c0_grid_desc_n_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock_;
         Block2CTileMap block_2_ctile_map_;
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
@@ -513,7 +503,8 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -538,7 +529,12 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
 
+            auto c0_grid_desc_nblock_nperblock =
+                GridwiseGemm::MakeC0GridDescriptor_NBlock_NPerBlock(arg.c0_grid_desc_n_);
             const index_t grid_size =
                 arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
 
@@ -565,28 +561,27 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
                     Block2CTileMap,
                     true>;
 
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_c0_grid_bias_,
-                                           arg.p_c0_grid_add_,
-                                           arg.p_c0_grid_gamma_,
-                                           arg.p_c0_grid_beta_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.acc_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c0_grid_desc_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.p_a_grid_,
+                                                  arg.p_b_grid_,
+                                                  arg.p_c_grid_,
+                                                  arg.p_c0_grid_bias_,
+                                                  arg.p_c0_grid_add_,
+                                                  arg.p_c0_grid_gamma_,
+                                                  arg.p_c0_grid_beta_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.acc_element_op_,
+                                                  arg.c_element_op_,
+                                                  arg.a_grid_desc_ak0_m_ak1_,
+                                                  arg.b_grid_desc_bk0_n_bk1_,
+                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  c0_grid_desc_nblock_nperblock,
+                                                  arg.block_2_ctile_map_);
             }
             else
             {
@@ -605,33 +600,33 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
                     typename GridwiseGemm::C0GridDescriptor_NBlock_NPerBlock,
                     Block2CTileMap,
                     false>;
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.p_c0_grid_bias_,
-                                           arg.p_c0_grid_add_,
-                                           arg.p_c0_grid_gamma_,
-                                           arg.p_c0_grid_beta_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.acc_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.c0_grid_desc_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.p_a_grid_,
+                                                  arg.p_b_grid_,
+                                                  arg.p_c_grid_,
+                                                  arg.p_c0_grid_bias_,
+                                                  arg.p_c0_grid_add_,
+                                                  arg.p_c0_grid_gamma_,
+                                                  arg.p_c0_grid_beta_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.acc_element_op_,
+                                                  arg.c_element_op_,
+                                                  arg.a_grid_desc_ak0_m_ak1_,
+                                                  arg.b_grid_desc_bk0_n_bk1_,
+                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  c0_grid_desc_nblock_nperblock,
+                                                  arg.block_2_ctile_map_);
             }
 
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -648,15 +643,31 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                                     arg.b_grid_desc_bk0_n_bk1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
index 7315fe75a3..4abd14b080 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
@@ -63,6 +63,10 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                                                  BElementwiseOperation,
                                                  CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -188,7 +192,8 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
     using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
@@ -207,7 +212,7 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -222,6 +227,8 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
         Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder,
         CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -246,8 +253,6 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
               a_grid_desc_k0_m_k1_{},
               b_grid_desc_k0_n_k1_{},
               c_grid_desc_m_n_{},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              block_2_ctile_map_{},
               M01_{M01},
               N01_{N01},
               a_element_op_{a_element_op},
@@ -259,19 +264,6 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
             b_grid_desc_k0_n_k1_ =
                 DeviceGemmXdlSkipBLds::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
             c_grid_desc_m_n_ = DeviceGemmXdlSkipBLds::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            if(GridwiseGemm::CheckValidity(
-                   a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_, M01_, N01_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-
-                block_2_ctile_map_ =
-                    GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-                b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_ =
-                    GridwiseGemm::MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(b_grid_desc_k0_n_k1_);
-            }
         }
 
         //  private:
@@ -281,11 +273,6 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
         AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3
-            b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_;
-        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
         index_t M01_;
         index_t N01_;
         AElementwiseOperation a_element_op_;
@@ -298,7 +285,8 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
     {
         using Argument = DeviceGemmXdlSkipBLds::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -323,7 +311,8 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
             }
-
+            auto block_2_ctile_map =
+                GridwiseGemm::MakeDefaultBlock2CTileMap(arg.c_grid_desc_m_n_, arg.M01_, arg.N01_);
             const index_t grid_size = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_);
 
             const auto K0 = arg.a_grid_desc_k0_m_k1_.GetLength(I0);
@@ -340,8 +329,7 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                     CDataType,
                     remove_reference_t<DeviceGemmXdlSkipBLds::AGridDesc_K0_M_K1>,
                     remove_reference_t<DeviceGemmXdlSkipBLds::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                    remove_reference_t<DeviceGemmXdlSkipBLds::CGridDesc_M_N>,
                     AElementwiseOperation,
                     BElementwiseOperation,
                     CElementwiseOperation,
@@ -357,12 +345,12 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                                                   arg.p_b_grid_,
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+                                                  arg.b_grid_desc_k0_n_k1_,
+                                                  arg.c_grid_desc_m_n_,
                                                   arg.a_element_op_,
                                                   arg.b_element_op_,
                                                   arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  block_2_ctile_map);
             }
             else
             {
@@ -372,8 +360,7 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                     CDataType,
                     remove_reference_t<DeviceGemmXdlSkipBLds::AGridDesc_K0_M_K1>,
                     remove_reference_t<DeviceGemmXdlSkipBLds::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                    remove_reference_t<DeviceGemmXdlSkipBLds::CGridDesc_M_N>,
                     AElementwiseOperation,
                     BElementwiseOperation,
                     CElementwiseOperation,
@@ -389,17 +376,19 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
                                                   arg.p_b_grid_,
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+                                                  arg.b_grid_desc_k0_n_k1_,
+                                                  arg.c_grid_desc_m_n_,
                                                   arg.a_element_op_,
                                                   arg.b_element_op_,
                                                   arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  block_2_ctile_map);
             }
 
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -416,16 +405,33 @@ struct DeviceGemmXdlSkipBLds : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.M01_,
-                                           arg.N01_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.M01_,
+                                                     arg.N01_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_k0_m_k1_,
+                                                     arg.b_grid_desc_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.M01_,
+                                                     arg.N01_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 2666051c86..ef4593c320 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -75,6 +75,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                              CElementwiseOperation,
                                                              ComputeType>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -86,7 +90,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     using ComputeTypeA = ComputeType;
     using ComputeTypeB = ComputeType;
 
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType,
         BDataType,
@@ -107,7 +112,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -134,8 +139,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         ComputeTypeB,
         LDSTypeA,
         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    struct Argument : public GridwiseGemm::Argument
+    struct Argument : public GridwiseGemm64::Argument
     {
         Argument(const ADataType* p_a_grid_,
                  const BDataType* p_b_grid_,
@@ -154,20 +161,20 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                  AElementwiseOperation a_element_op_,
                  BElementwiseOperation b_element_op_,
                  CElementwiseOperation c_element_op_)
-            : GridwiseGemm::Argument(p_a_grid_,
-                                     p_b_grid_,
-                                     p_c_grid_,
-                                     M_,
-                                     N_,
-                                     K_,
-                                     StrideA_,
-                                     StrideB_,
-                                     StrideC_,
-                                     MPadded_,
-                                     NPadded_,
-                                     KPadded_,
-                                     K0Padded_,
-                                     k_batch_),
+            : GridwiseGemm64::Argument(p_a_grid_,
+                                       p_b_grid_,
+                                       p_c_grid_,
+                                       M_,
+                                       N_,
+                                       K_,
+                                       StrideA_,
+                                       StrideB_,
+                                       StrideC_,
+                                       MPadded_,
+                                       NPadded_,
+                                       KPadded_,
+                                       K0Padded_,
+                                       k_batch_),
               a_element_op(a_element_op_),
               b_element_op(b_element_op_),
               c_element_op(c_element_op_)
@@ -179,7 +186,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CElementwiseOperation c_element_op;
     };
 
-    using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
+    using DefaultBlock2CTileMap = typename GridwiseGemm64::DefaultBlock2CTileMap;
 
     // Invoker
     struct Invoker : public BaseInvoker
@@ -187,15 +194,29 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
 
         void Print(const Argument& karg) { karg.Print(); }
 
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
-                Print(karg);
+                Print(arg);
             }
 
+            typename GridwiseGemm::Argument karg(arg.p_a_grid,
+                                                 arg.p_b_grid,
+                                                 arg.p_c_grid,
+                                                 arg.M,
+                                                 arg.N,
+                                                 arg.K,
+                                                 arg.StrideA,
+                                                 arg.StrideB,
+                                                 arg.StrideC,
+                                                 arg.MPadded,
+                                                 arg.NPadded,
+                                                 arg.KPadded,
+                                                 arg.K0Padded,
+                                                 arg.k_batch);
             const auto kbatch = karg.k_batch;
-
             if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
@@ -227,9 +248,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                            0,
                                            static_cast<typename GridwiseGemm::Argument>(karg),
                                            b2c_map,
-                                           karg.a_element_op,
-                                           karg.b_element_op,
-                                           karg.c_element_op);
+                                           arg.a_element_op,
+                                           arg.b_element_op,
+                                           arg.c_element_op);
             };
 
             if(has_main_k0_block_loop)
@@ -294,6 +315,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -310,12 +333,31 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
 
     static bool IsSupportedArgument(const Argument& karg)
     {
-        if(!ck::is_xdl_supported())
+        // gfx11 doesn't support float atomic
+        if(ck::is_gfx11_supported())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(karg);
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(karg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(karg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -347,10 +389,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideA,
                         StrideB,
                         StrideC,
-                        GridwiseGemm::CalculateMPadded(M),
-                        GridwiseGemm::CalculateNPadded(N),
-                        GridwiseGemm::CalculateKPadded(K, KBatch),
-                        GridwiseGemm::CalculateK0Padded(K, KBatch),
+                        GridwiseGemm64::CalculateMPadded(M),
+                        GridwiseGemm64::CalculateNPadded(N),
+                        GridwiseGemm64::CalculateKPadded(K, KBatch),
+                        GridwiseGemm64::CalculateK0Padded(K, KBatch),
                         KBatch,
                         a_element_op,
                         b_element_op,
@@ -383,10 +425,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          GridwiseGemm::CalculateMPadded(M),
-                                          GridwiseGemm::CalculateNPadded(N),
-                                          GridwiseGemm::CalculateKPadded(K, KBatch),
-                                          GridwiseGemm::CalculateK0Padded(K, KBatch),
+                                          GridwiseGemm64::CalculateMPadded(M),
+                                          GridwiseGemm64::CalculateNPadded(N),
+                                          GridwiseGemm64::CalculateKPadded(K, KBatch),
+                                          GridwiseGemm64::CalculateK0Padded(K, KBatch),
                                           KBatch,
                                           a_element_op,
                                           b_element_op,
@@ -410,7 +452,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
                                                                        {PipelineVersion::v2, "v2"}};
 
-        str << GridwiseGemm::GetTypeString() << " LoopScheduler: " << LoopSchedToString[LoopSched]
+        str << GridwiseGemm64::GetTypeString() << " LoopScheduler: " << LoopSchedToString[LoopSched]
             << ", PipelineVersion: " << PipelineVersionToString[PipelineVer];
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp
index eda966c48a..f9e164e7b3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -70,12 +70,17 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
                                                                            CElementwiseOperation,
                                                                            ComputeType>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    using GridwiseGemm = GridwiseGemm_xdlops_splitk_lds_direct_load<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdlops_splitk_lds_direct_load<
         BlockSize,
         ADataType,
         BDataType,
@@ -96,7 +101,7 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
@@ -114,8 +119,10 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
         LoopSched,
         PipelineVer,
         ComputeType>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    struct Argument : public GridwiseGemm::Argument
+    struct Argument : public GridwiseGemm64::Argument
     {
         Argument(const ADataType* p_a_grid_,
                  const BDataType* p_b_grid_,
@@ -134,20 +141,20 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
                  AElementwiseOperation a_element_op_,
                  BElementwiseOperation b_element_op_,
                  CElementwiseOperation c_element_op_)
-            : GridwiseGemm::Argument(p_a_grid_,
-                                     p_b_grid_,
-                                     p_c_grid_,
-                                     M_,
-                                     N_,
-                                     K_,
-                                     StrideA_,
-                                     StrideB_,
-                                     StrideC_,
-                                     MPadded_,
-                                     NPadded_,
-                                     KPadded_,
-                                     K0Padded_,
-                                     k_batch_),
+            : GridwiseGemm64::Argument(p_a_grid_,
+                                       p_b_grid_,
+                                       p_c_grid_,
+                                       M_,
+                                       N_,
+                                       K_,
+                                       StrideA_,
+                                       StrideB_,
+                                       StrideC_,
+                                       MPadded_,
+                                       NPadded_,
+                                       KPadded_,
+                                       K0Padded_,
+                                       k_batch_),
               a_element_op(a_element_op_),
               b_element_op(b_element_op_),
               c_element_op(c_element_op_)
@@ -159,15 +166,19 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
         CElementwiseOperation c_element_op;
     };
 
-    using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
+    using DefaultBlock2CTileMap = typename GridwiseGemm64::DefaultBlock2CTileMap;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
+        template <typename Argument_>
+        void Print(const Argument_& karg)
+        {
+            karg.Print();
+        }
 
-        void Print(const Argument& karg) { karg.Print(); }
-
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -175,8 +186,8 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
             }
 
             const auto kbatch = karg.k_batch;
-
-            if(!GridwiseGemm::CheckValidity(karg))
+            auto arg          = *reinterpret_cast<const typename GridwiseGemm::Argument*>(&karg);
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
@@ -199,17 +210,16 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
                                                      karg.M * karg.N * sizeof(CDataType),
                                                      stream_config.stream_id_));
 
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(gdx, gdy, gdz),
-                                           dim3(BlockSize),
-                                           0,
-                                           static_cast<typename GridwiseGemm::Argument>(karg),
-                                           b2c_map,
-                                           karg.a_element_op,
-                                           karg.b_element_op,
-                                           karg.c_element_op);
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(gdx, gdy, gdz),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg,
+                                                  b2c_map,
+                                                  karg.a_element_op,
+                                                  karg.b_element_op,
+                                                  karg.c_element_op);
             };
 
             if(has_main_k0_block_loop)
@@ -274,6 +284,8 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -290,12 +302,31 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
 
     static bool IsSupportedArgument(const Argument& karg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_lds_direct_load_supported())
+        {
+            return false;
+        }
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(karg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(karg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(karg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -327,10 +358,10 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
                         StrideA,
                         StrideB,
                         StrideC,
-                        GridwiseGemm::CalculateMPadded(M),
-                        GridwiseGemm::CalculateNPadded(N),
-                        GridwiseGemm::CalculateKPadded(K, KBatch),
-                        GridwiseGemm::CalculateK0Padded(K, KBatch),
+                        GridwiseGemm64::CalculateMPadded(M),
+                        GridwiseGemm64::CalculateNPadded(N),
+                        GridwiseGemm64::CalculateKPadded(K, KBatch),
+                        GridwiseGemm64::CalculateK0Padded(K, KBatch),
                         KBatch,
                         a_element_op,
                         b_element_op,
@@ -363,10 +394,10 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          GridwiseGemm::CalculateMPadded(M),
-                                          GridwiseGemm::CalculateNPadded(N),
-                                          GridwiseGemm::CalculateKPadded(K, KBatch),
-                                          GridwiseGemm::CalculateK0Padded(K, KBatch),
+                                          GridwiseGemm64::CalculateMPadded(M),
+                                          GridwiseGemm64::CalculateNPadded(N),
+                                          GridwiseGemm64::CalculateKPadded(K, KBatch),
+                                          GridwiseGemm64::CalculateK0Padded(K, KBatch),
                                           KBatch,
                                           a_element_op,
                                           b_element_op,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
index 51b8958d61..c28930d0b6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -68,12 +68,17 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
                                                        BElementwiseOperation,
                                                        CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk<
         BlockSize,
         BlockToCTileMap_GemmStreamK<MPerBlock,
                                     NPerBlock,
@@ -95,7 +100,7 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
         NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -116,15 +121,23 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
         CShuffleNRepeatPerShuffle,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        void Print(const Argument& karg) { karg.Print(); }
+        template <typename Argument_>
+        void Print(const Argument_& karg)
+        {
+            karg.Print();
+        }
 
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& karg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -204,6 +217,8 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -215,15 +230,25 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
     size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
     {
         const Argument* p_arg = dynamic_cast<const Argument*>(pArg);
-        if constexpr(GridwiseGemm::Block2CTileMap::ReductionStrategy ==
-                     StreamKReductionStrategy::Reduction)
+        if(get_warp_size() == 64)
         {
-            return p_arg->block_mapping.get_workspace_size(sizeof(typename GridwiseGemm::FloatAcc));
+            if constexpr(GridwiseGemm64::Block2CTileMap::ReductionStrategy ==
+                         StreamKReductionStrategy::Reduction)
+            {
+                return p_arg->block_mapping.get_workspace_size(
+                    sizeof(typename GridwiseGemm64::FloatAcc));
+            }
         }
         else
         {
-            return 0;
+            if constexpr(GridwiseGemm32::Block2CTileMap::ReductionStrategy ==
+                         StreamKReductionStrategy::Reduction)
+            {
+                return p_arg->block_mapping.get_workspace_size(
+                    sizeof(typename GridwiseGemm32::FloatAcc));
+            }
         }
+        return 0;
     }
 
     void SetWorkSpacePointer(BaseArgument* pArg,
@@ -243,11 +268,26 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
 
     static bool IsSupportedArgument(const Argument& karg)
     {
-        if(!(ck::is_xdl_supported()))
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-        return GridwiseGemm::CheckValidity(karg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(karg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(karg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -270,12 +310,38 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
                              CElementwiseOperation,
                              uint32_t NumSKBlocks = 0xffffffff)
     {
-        const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm>;
-        int occupancy, num_cu;
+        int num_cu;
         hipError_t rtn;
-        rtn = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-            &occupancy, kernel, BlockSize, GridwiseGemm::GetSharedMemoryNumberOfByte());
-        hip_check_error(rtn);
+        int occupancy = [&]() {
+            int occupancy_ = 0;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm64>;
+                    rtn               = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                        &occupancy_,
+                        kernel,
+                        BlockSize,
+                        GridwiseGemm64::GetSharedMemoryNumberOfByte());
+                    hip_check_error(rtn);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm32>;
+                    rtn               = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                        &occupancy_,
+                        kernel,
+                        BlockSize,
+                        GridwiseGemm32::GetSharedMemoryNumberOfByte());
+                    hip_check_error(rtn);
+                }
+            }
+            return occupancy_;
+        }();
 
         hipDeviceProp_t dev_prop;
         hipDevice_t dev;
@@ -316,12 +382,39 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
                                                       CElementwiseOperation,
                                                       index_t NumSKBlocks = 0) override
     {
-        const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm>;
-        int occupancy, num_cu;
+        int num_cu;
         hipError_t rtn;
-        rtn = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-            &occupancy, kernel, BlockSize, GridwiseGemm::GetSharedMemoryNumberOfByte());
-        hip_check_error(rtn);
+
+        int occupancy = [&]() {
+            int occupancy_ = 0;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm64>;
+                    rtn               = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                        &occupancy_,
+                        kernel,
+                        BlockSize,
+                        GridwiseGemm64::GetSharedMemoryNumberOfByte());
+                    hip_check_error(rtn);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm32>;
+                    rtn               = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                        &occupancy_,
+                        kernel,
+                        BlockSize,
+                        GridwiseGemm32::GetSharedMemoryNumberOfByte());
+                    hip_check_error(rtn);
+                }
+            }
+            return occupancy_;
+        }();
 
         hipDeviceProp_t dev_prop;
         hipDevice_t dev;
@@ -352,7 +445,11 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
     }
 
     // polymorphic
-    std::string GetTypeString() const override { return GridwiseGemm::GetTypeString(); }
+    std::string GetTypeString() const override
+    {
+        return get_warp_size() == 64 ? GridwiseGemm64::GetTypeString()
+                                     : GridwiseGemm32::GetTypeString();
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp
index 1042f8948c..fd490fbd1c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -46,20 +46,23 @@ __launch_bounds__(CK_WAVELET_MAX_THREAD_PER_BLOCK, CK_WAVELET_MIN_BLOCK_PER_CU)
                                               e_grid_desc_mblock_mperblock_nblock_nperblock,
                                           const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_e_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  e_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_e_grid,
+                                                      p_shared,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      e_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -132,6 +135,11 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
                                                                 BElementwiseOperation,
                                                                 CDEElementwiseOperation>
 {
+    static constexpr auto BlockSize = math::max(TileLoadThreadGroupSize, TileMathThreadGroupSize);
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
     using DeviceOp = DeviceGemm_Xdl_WaveletModel_CShuffle;
 
     static constexpr auto I0 = Number<0>{};
@@ -201,7 +209,8 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
     using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N<ELayout>(1, 1, 1));
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         GemmAcEDataType,
         CShuffleDataType,
@@ -224,7 +233,7 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -245,15 +254,17 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
         CShuffleNXdlPerWavePerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
 
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    using Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap;
 
     // Argument
     struct Argument : public BaseArgument
@@ -277,22 +288,14 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
               b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N<ELayout>(MRaw, NRaw, StrideE)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
         {
-            if(GridwiseGemm::CheckValidity(
-                   a_grid_desc_m_k_, b_grid_desc_n_k_, e_grid_desc_m_n_, block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-            }
         }
 
         void Print() const
@@ -316,8 +319,6 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
@@ -333,7 +334,8 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -359,6 +361,9 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
 
             const index_t grid_size = GridwiseGemm::CalculateGridSize(arg.e_grid_desc_m_n_);
             const auto K            = arg.a_grid_desc_m_k_.GetLength(I1);
@@ -393,7 +398,7 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
                     arg.cde_element_op_,
                     arg.a_grid_desc_ak0_m_ak1_,
                     arg.b_grid_desc_bk0_n_bk1_,
-                    arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    e_grid_desc_mblock_mperblock_nblock_nperblock,
                     arg.block_2_etile_map_);
             };
 
@@ -407,6 +412,8 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -417,15 +424,31 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index 5449525306..68e63fcb5b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -37,47 +37,50 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const BElementwiseOperation b_element_op,
         const CDEElementwiseOperation cde_element_op)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t block_id = get_block_1d_id();
-
-    const auto contraction_arg_ptr = reinterpret_cast<const ContractionMultiDKernelArg*>(
-        cast_pointer_to_generic_address_space(contraction_args));
-
-    index_t left     = 0;
-    index_t right    = group_count;
-    index_t group_id = index_t((left + right) / 2);
-
-    while((!(block_id >= contraction_arg_ptr[group_id].block_start_ &&
-             block_id < contraction_arg_ptr[group_id].block_end_)) &&
-          left <= right)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        if(block_id < contraction_arg_ptr[group_id].block_start_)
-        {
-            right = group_id;
-        }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
-    }
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        contraction_arg_ptr[group_id].p_a_grid_,
-        contraction_arg_ptr[group_id].p_b_grid_,
-        contraction_arg_ptr[group_id].p_ds_grid_,
-        contraction_arg_ptr[group_id].p_e_grid_,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        cde_element_op,
-        contraction_arg_ptr[group_id].a_grid_desc_ak0_m_ak1_,
-        contraction_arg_ptr[group_id].b_grid_desc_bk0_n_bk1_,
-        contraction_arg_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-        contraction_arg_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
-        contraction_arg_ptr[group_id].block_2_etile_map_);
+        const index_t block_id = get_block_1d_id();
+
+        const auto contraction_arg_ptr = reinterpret_cast<const ContractionMultiDKernelArg*>(
+            cast_pointer_to_generic_address_space(contraction_args));
+
+        index_t left     = 0;
+        index_t right    = group_count;
+        index_t group_id = index_t((left + right) / 2);
+
+        while((!(block_id >= contraction_arg_ptr[group_id].block_start_ &&
+                 block_id < contraction_arg_ptr[group_id].block_end_)) &&
+              left <= right)
+        {
+            if(block_id < contraction_arg_ptr[group_id].block_start_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
+        }
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            contraction_arg_ptr[group_id].p_a_grid_,
+            contraction_arg_ptr[group_id].p_b_grid_,
+            contraction_arg_ptr[group_id].p_ds_grid_,
+            contraction_arg_ptr[group_id].p_e_grid_,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            contraction_arg_ptr[group_id].a_grid_desc_ak0_m_ak1_,
+            contraction_arg_ptr[group_id].b_grid_desc_bk0_n_bk1_,
+            contraction_arg_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+            contraction_arg_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
+            contraction_arg_ptr[group_id].block_2_etile_map_);
+    }
 #else
     ignore = contraction_args;
     ignore = group_count;
@@ -165,6 +168,9 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
 {
     using DeviceOp = DeviceGroupedContractionMultipleD_Xdl_CShuffle;
 
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -357,7 +363,8 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
     using ComputeDataType = ADataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
         ComputeDataType,
@@ -378,7 +385,7 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -400,31 +407,33 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     struct GroupedContractionBlock2ETileMap
     {
         // block-to-e-tile map
         using Block2ETileMap =
-            remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+            remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
         GroupedContractionBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n,
                                          ck::index_t BlockStart)
         {
-            default_block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
+            default_block_2_etile_map_ = GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
             block_start_               = BlockStart;
         }
 
@@ -457,7 +466,7 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for block/thread-wise copy
@@ -531,7 +540,7 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
                     contraction_descs[i].b_ns_ks_lengths, contraction_descs[i].b_ns_ks_strides);
 
                 DsGridDesc_M_N ds_grid_desc_m_n;
-                typename GridwiseGemm::DsGridPointer p_ds_grid;
+                typename GridwiseGemm64::DsGridPointer p_ds_grid;
 
                 // populate pointer, batch stride, desc for Ds
                 static_for<0, NumDTensor, 1>{}([&](auto j) {
@@ -550,19 +559,19 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
                     contraction_descs[i].e_ms_ns_lengths, contraction_descs[i].e_ms_ns_strides);
 
                 const auto a_grid_desc_ak0_m_ak1 =
-                    GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k);
+                    GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k);
                 const auto b_grid_desc_bk0_n_bk1 =
-                    GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k);
+                    GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k);
 
                 const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                         ds_grid_desc_m_n);
                 const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                         e_grid_desc_m_n);
 
                 const index_t grid_size_grp =
-                    GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n)
+                    GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n)
                         .CalculateGridSize(e_grid_desc_m_n);
 
                 const index_t BlockStart = grid_size_;
@@ -592,11 +601,30 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
                 const index_t e_nz_stride =
                     contraction_descs[i].e_ms_ns_strides[NumDimM + NumDimN - 1];
 
-                if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
-                                               b_grid_desc_n_k,
-                                               ds_grid_desc_m_n,
-                                               e_grid_desc_m_n,
-                                               block_2_etile_map))
+                bool valid = false;
+                if(get_warp_size() == 64)
+                {
+                    if constexpr(NXdlPerWave64 > 0)
+                    {
+                        valid = GridwiseGemm64::CheckValidity(a_grid_desc_m_k,
+                                                              b_grid_desc_n_k,
+                                                              ds_grid_desc_m_n,
+                                                              e_grid_desc_m_n,
+                                                              block_2_etile_map);
+                    }
+                }
+                else
+                {
+                    if constexpr(NXdlPerWave32 > 0)
+                    {
+                        valid = GridwiseGemm32::CheckValidity(a_grid_desc_m_k,
+                                                              b_grid_desc_n_k,
+                                                              ds_grid_desc_m_n,
+                                                              e_grid_desc_m_n,
+                                                              block_2_etile_map);
+                    }
+                }
+                if(valid)
                 {
                     contraction_multi_d_kernel_args_.push_back(
                         {p_a_grid,
@@ -642,7 +670,8 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             bool has_main_k_block_loop = true;
 
@@ -701,6 +730,8 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -711,11 +742,10 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         for(std::size_t i = 0; i < arg.group_count_; i++)
         {
             const auto a_grid_desc_m_k_ = arg.contraction_multi_d_device_args_[i].a_grid_desc_m_k_;
@@ -744,11 +774,30 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
             const auto ds_nz_stride_ = arg.contraction_multi_d_device_args_[i].ds_nz_stride_;
             const auto e_nz_stride_  = arg.contraction_multi_d_device_args_[i].e_nz_stride_;
 
-            if(!GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                            b_grid_desc_n_k_,
-                                            ds_grid_desc_m_n_,
-                                            e_grid_desc_m_n_,
-                                            block_2_etile_map_))
+            bool valid = false;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    valid = GridwiseGemm64::CheckValidity(a_grid_desc_m_k_,
+                                                          b_grid_desc_n_k_,
+                                                          ds_grid_desc_m_n_,
+                                                          e_grid_desc_m_n_,
+                                                          block_2_etile_map_);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    valid = GridwiseGemm32::CheckValidity(a_grid_desc_m_k_,
+                                                          b_grid_desc_n_k_,
+                                                          ds_grid_desc_m_n_,
+                                                          e_grid_desc_m_n_,
+                                                          block_2_etile_map_);
+                }
+            }
+            if(!valid)
             {
                 return false;
             }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 25923235c3..3d6f34f121 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -96,83 +96,64 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputePtrOffsetOfN compute_ptr_offset_of_n,
         const index_t KBatch)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t block_args_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
-    const index_t g_idx         = __builtin_amdgcn_readfirstlane(blockIdx.y);
-    const index_t n_idx         = __builtin_amdgcn_readfirstlane(blockIdx.z / KBatch);
-    const index_t k_idx         = __builtin_amdgcn_readfirstlane(blockIdx.z - n_idx * KBatch);
-
-    const long_index_t a_batch_offset =
-        CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))
-                   : amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))
-                   : amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
-
-    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
-
-    const long_index_t a_n_offset =
-        CTranspose ? 0 : amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
-    const long_index_t b_n_offset =
-        CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)) : 0;
-
-    const long_index_t e_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
-
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    DsPointer p_ds_grid_grp;
-
-    static constexpr index_t NumDTensor = DsPointer::Size();
-
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
-
-    index_t left     = 0;
-    index_t right    = gemms_count;
-    index_t group_id = index_t((left + right) / 2);
-    while((!(block_args_id >= gemm_kernel_args[group_id].BlockStart_ &&
-             block_args_id < gemm_kernel_args[group_id].BlockEnd_)) &&
-          left <= right)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<OutElementOp>())
     {
-        if(block_args_id < gemm_kernel_args[group_id].BlockStart_)
+        // offset base pointer for each work-group
+        const index_t block_args_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t g_idx         = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t n_idx         = __builtin_amdgcn_readfirstlane(blockIdx.z / KBatch);
+        const index_t k_idx         = __builtin_amdgcn_readfirstlane(blockIdx.z - n_idx * KBatch);
+
+        const long_index_t a_batch_offset =
+            CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))
+                       : amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const long_index_t b_batch_offset =
+            CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))
+                       : amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        const long_index_t e_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+
+        const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+
+        const long_index_t a_n_offset =
+            CTranspose ? 0 : amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+        const long_index_t b_n_offset =
+            CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)) : 0;
+
+        const long_index_t e_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        DsPointer p_ds_grid_grp;
+
+        static constexpr index_t NumDTensor = DsPointer::Size();
+
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+
+        index_t left     = 0;
+        index_t right    = gemms_count;
+        index_t group_id = index_t((left + right) / 2);
+        while((!(block_args_id >= gemm_kernel_args[group_id].BlockStart_ &&
+                 block_args_id < gemm_kernel_args[group_id].BlockEnd_)) &&
+              left <= right)
         {
-            right = group_id;
+            if(block_args_id < gemm_kernel_args[group_id].BlockStart_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
         }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
-    }
 
-    if constexpr(HasMainKBlockLoopInAllGemm || NoMainKBlockLoopInAllGemm)
-    {
-        GridwiseGemm::template Run<HasMainKBlockLoopInAllGemm, OutElementOp>(
-            p_a_grid + a_batch_offset + a_n_offset,
-            p_b_grid + b_batch_offset + b_n_offset,
-            p_ds_grid_grp,
-            p_e_grid + e_batch_offset + e_n_offset,
-            p_shared,
-            a_element_op,
-            b_element_op,
-            cde_element_op,
-            gemm_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
-            gemm_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
-            gemm_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-            gemm_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
-            gemm_kernel_args[group_id].block_2_ctile_map_,
-            KBatch,
-            k_idx);
-    }
-    else
-    {
-        if(gemm_kernel_args[group_id].HasMainKBlockLoop_)
+        if constexpr(HasMainKBlockLoopInAllGemm || NoMainKBlockLoopInAllGemm)
         {
-            GridwiseGemm::template Run<true, OutElementOp>(
+            GridwiseGemm::template Run<HasMainKBlockLoopInAllGemm, OutElementOp>(
                 p_a_grid + a_batch_offset + a_n_offset,
                 p_b_grid + b_batch_offset + b_n_offset,
                 p_ds_grid_grp,
@@ -191,22 +172,44 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         }
         else
         {
-            GridwiseGemm::template Run<false, OutElementOp>(
-                p_a_grid + a_batch_offset + a_n_offset,
-                p_b_grid + b_batch_offset + b_n_offset,
-                p_ds_grid_grp,
-                p_e_grid + e_batch_offset + e_n_offset,
-                p_shared,
-                a_element_op,
-                b_element_op,
-                cde_element_op,
-                gemm_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
-                gemm_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
-                gemm_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                gemm_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                gemm_kernel_args[group_id].block_2_ctile_map_,
-                KBatch,
-                k_idx);
+            if(gemm_kernel_args[group_id].HasMainKBlockLoop_)
+            {
+                GridwiseGemm::template Run<true, OutElementOp>(
+                    p_a_grid + a_batch_offset + a_n_offset,
+                    p_b_grid + b_batch_offset + b_n_offset,
+                    p_ds_grid_grp,
+                    p_e_grid + e_batch_offset + e_n_offset,
+                    p_shared,
+                    a_element_op,
+                    b_element_op,
+                    cde_element_op,
+                    gemm_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
+                    gemm_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
+                    gemm_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    gemm_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    gemm_kernel_args[group_id].block_2_ctile_map_,
+                    KBatch,
+                    k_idx);
+            }
+            else
+            {
+                GridwiseGemm::template Run<false, OutElementOp>(
+                    p_a_grid + a_batch_offset + a_n_offset,
+                    p_b_grid + b_batch_offset + b_n_offset,
+                    p_ds_grid_grp,
+                    p_e_grid + e_batch_offset + e_n_offset,
+                    p_shared,
+                    a_element_op,
+                    b_element_op,
+                    cde_element_op,
+                    gemm_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
+                    gemm_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
+                    gemm_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    gemm_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    gemm_kernel_args[group_id].block_2_ctile_map_,
+                    KBatch,
+                    k_idx);
+            }
         }
     }
 #else
@@ -316,6 +319,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             : 32;
 
     using DeviceOp = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor          = DsDataType::Size();
     static constexpr GemmSpecialization GemmSpec = GemmSpecialization::MNKPadding;
@@ -436,7 +442,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 #define GridwiseGemmMultiDTemplateParams                                                        \
     ABDataType, ABDataType, AComputeType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
         AElementwiseOp, BElementwiseOp, CDEElementwiseOp, NumGemmKPrefetchStage, BlockSize,     \
-        MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave,  \
+        MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, \
         ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder,  \
         ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim,                               \
         ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false,          \
@@ -451,7 +457,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 #define GridwiseGemmCTransposeTemplateParameters                                                \
     ABDataType, ABDataType, AComputeType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
         BElementwiseOp, AElementwiseOp, CDEElementwiseOp, NumGemmKPrefetchStage, BlockSize,     \
-        NPerBlock, MPerBlock, KPerBlock, BK1, AK1, NPerXDL, MPerXDL, NXdlPerWave, MXdlPerWave,  \
+        NPerBlock, MPerBlock, KPerBlock, BK1, AK1, NPerXDL, MPerXDL, NXdlPerWave_, MXdlPerWave, \
         BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder,  \
         BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim,                               \
         BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false,          \
@@ -463,17 +469,26 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                       \
         CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, BComputeType
 
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmMultiDTemplateParams>;
-    using GridwiseGemmCTranspose = std::conditional_t<
-        CTranspose,
-        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmCTransposeTemplateParameters>,
-        GridwiseGemm>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmMultiDTemplateParams>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmCTransposeBase =
+        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmCTransposeTemplateParameters>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
+
+    using GridwiseGemmCTranspose64 =
+        std::conditional_t<CTranspose,
+                           GridwiseGemmCTransposeBase<math::max(NXdlPerWave64, 1)>,
+                           GridwiseGemm64>;
+    using GridwiseGemmCTranspose32 =
+        std::conditional_t<CTranspose, GridwiseGemmCTransposeBase<NXdlPerWave32>, GridwiseGemm32>;
 
     template <typename EGridDesc_M_N>
     static auto
     MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N e_grid_desc_m_n)
     {
-        return GridwiseGemmCTranspose::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        return GridwiseGemmCTranspose64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             e_grid_desc_m_n);
     }
 
@@ -504,14 +519,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{}));
 
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemmCTranspose::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemmCTranspose64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}));
     using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
         decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));
 
     // block-to-e-tile map
     using Block2ETileMap =
-        decltype(GridwiseGemmCTranspose::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}));
+        decltype(GridwiseGemmCTranspose64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}));
 
     using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMap<Block2ETileMap>;
 
@@ -917,14 +932,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
                         const auto GemmK = a_grid_desc_m_k.GetLength(I1);
                         const bool HasMainKBlockLoop =
-                            GridwiseGemmCTranspose::CalculateHasMainKBlockLoop(GemmK, k_batch_);
+                            GridwiseGemmCTranspose64::CalculateHasMainKBlockLoop(GemmK, k_batch_);
 
                         gemm_kernel_args_[gemms_count_ /
                                           MaxGroupedGemmGroupsNum][gemms_count_ %
                                                                    MaxGroupedGemmGroupsNum] =
                             GemmArgs{a_grid_desc_ak0_m_ak1,
                                      b_grid_desc_bk0_n_bk1,
-                                     GridwiseGemmCTranspose::
+                                     GridwiseGemmCTranspose64::
                                          MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                              ds_grid_desc_m_n),
                                      MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
@@ -1069,7 +1084,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptor for problem definition
@@ -1122,7 +1137,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     {
         using Argument = DeviceOp::Argument;
 
-        template <InMemoryDataOperationEnum ElementOp>
+        template <typename GridwiseGemm,
+                  typename GridwiseGemmCTranspose,
+                  InMemoryDataOperationEnum ElementOp>
         float RunMultiDGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
@@ -1281,7 +1298,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             return ave_time;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm, typename GridwiseGemmCTranspose>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
 
@@ -1372,12 +1390,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                 if constexpr(IsSplitKSupported)
                 {
                     ave_time +=
-                        RunMultiDGemm<InMemoryDataOperationEnum::AtomicAdd>(arg, stream_config);
+                        RunMultiDGemm<GridwiseGemm,
+                                      GridwiseGemmCTranspose,
+                                      InMemoryDataOperationEnum::AtomicAdd>(arg, stream_config);
                 }
             }
             else
             {
-                ave_time += RunMultiDGemm<InMemoryDataOperationEnum::Set>(arg, stream_config);
+                ave_time += RunMultiDGemm<GridwiseGemm,
+                                          GridwiseGemmCTranspose,
+                                          InMemoryDataOperationEnum::Set>(arg, stream_config);
             }
 
             // Transpose from NHWGC to NGCHW
@@ -1427,6 +1449,31 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
             return ave_time;
         }
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64, GridwiseGemmCTranspose64>(arg, stream_config);
+                }
+                else
+                {
+                    return 0;
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32, GridwiseGemmCTranspose32>(arg, stream_config);
+                }
+                else
+                {
+                    return 0;
+                }
+            }
+        }
 
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -1437,11 +1484,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        // gfx11 doesn't support float atomic
+        // Todo: Enable splitK for gfx12
+        if((ck::is_gfx12_supported() || ck::is_gfx11_supported()) && arg.k_batch_ > 1)
+        {
+            return false;
+        }
+        if(!ck::is_xdl_wmma_supported<AComputeType, BComputeType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<EDataType, ck::bhalf_t> &&
            arg.k_batch_ > 1)
         {
@@ -1598,16 +1650,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         }
 
         // Gridwise GEMM size
+        bool isWave64 = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.a_grid_desc_m_k_container_.size(); i++)
         {
-            if(!GridwiseGemmCTranspose::CheckValidity(
-                   arg.a_grid_desc_m_k_container_[i],
-                   arg.b_grid_desc_n_k_container_[i],
-                   arg.ds_grid_desc_m_n_container_[i],
-                   arg.e_grid_desc_m_n_container_[i],
-                   arg.gemm_kernel_args_[i / MaxGroupedGemmGroupsNum][i % MaxGroupedGemmGroupsNum]
-                       .block_2_ctile_map_,
-                   arg.k_batch_))
+            bool valid = true;
+            if(isWave64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    if(!GridwiseGemmCTranspose64::CheckValidity(
+                           arg.a_grid_desc_m_k_container_[i],
+                           arg.b_grid_desc_n_k_container_[i],
+                           arg.ds_grid_desc_m_n_container_[i],
+                           arg.e_grid_desc_m_n_container_[i],
+                           arg.gemm_kernel_args_[i / MaxGroupedGemmGroupsNum]
+                                                [i % MaxGroupedGemmGroupsNum]
+                                                    .block_2_ctile_map_,
+                           arg.k_batch_))
+                    {
+                        valid = false;
+                    }
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    if(!GridwiseGemmCTranspose32::CheckValidity(
+                           arg.a_grid_desc_m_k_container_[i],
+                           arg.b_grid_desc_n_k_container_[i],
+                           arg.ds_grid_desc_m_n_container_[i],
+                           arg.e_grid_desc_m_n_container_[i],
+                           arg.gemm_kernel_args_[i / MaxGroupedGemmGroupsNum]
+                                                [i % MaxGroupedGemmGroupsNum]
+                                                    .block_2_ctile_map_,
+                           arg.k_batch_))
+                    {
+                        valid = false;
+                    }
+                }
+            }
+            if(!valid)
             {
                 return false;
             }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_explicit_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_explicit_xdl.hpp
index 5d68ca720a..be94da1e50 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_explicit_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_explicit_xdl.hpp
@@ -144,18 +144,39 @@ struct DeviceGroupedConvBwdWeight_Explicit_Xdl
                       end(e_g_k_c_xs_lengths),
                       begin(filter_spatial_lengths_));
 
-            if(split_k < 0)
+            if constexpr(IsTwoStageNeeded)
             {
-                const auto max_occupancy = DeviceGemmV3Op::GetMaxOccupancy();
-                index_t gdx, gdy, gdz;
-                std::tie(gdx, gdy, gdz) =
-                    DeviceGemmV3Op::GridwiseGemm::CalculateGridSize(M, N, BatchSize);
-                const index_t grid_size = gdx * gdy * gdz;
-                split_k_ = get_best_occupancy_k_batch_value(max_occupancy, grid_size);
+                if(split_k < 0)
+                {
+                    const auto max_occupancy = DeviceGemmV3Op::GetMaxOccupancy();
+                    index_t gdx, gdy, gdz;
+                    std::tie(gdx, gdy, gdz) =
+                        DeviceGemmV3Op::GridwiseGemm::CalculateGridSize(M, N, BatchSize);
+                    const index_t grid_size = gdx * gdy * gdz;
+                    split_k_ = get_best_occupancy_k_batch_value(max_occupancy, grid_size);
+                }
+                else
+                {
+                    split_k_ = split_k;
+                }
             }
             else
             {
-                split_k_ = split_k;
+#if !DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
+                if(split_k < 0)
+                {
+                    const auto max_occupancy = DeviceGemmV3Op::GetMaxOccupancy();
+                    index_t gdx, gdy, gdz;
+                    std::tie(gdx, gdy, gdz) =
+                        DeviceGemmV3Op::GridwiseGemm::CalculateGridSize(M, N, BatchSize);
+                    const index_t grid_size = gdx * gdy * gdz;
+                    split_k_ = get_best_occupancy_k_batch_value(max_occupancy, grid_size);
+                }
+                else
+#endif
+                {
+                    split_k_ = split_k;
+                }
             }
 
             if constexpr(IsTwoStageNeeded)
@@ -318,6 +339,16 @@ struct DeviceGroupedConvBwdWeight_Explicit_Xdl
 
     static bool IsSupportedArgument(const Argument& arg)
     {
+#if DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
+        if constexpr(!IsTwoStageNeeded)
+        {
+            if(arg.split_k_ < 0)
+            {
+                return false;
+            }
+        }
+#endif
+
         if constexpr(NDimSpatial == 2)
         {
             if constexpr(!is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>())
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
index b761939642..987a1e273a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
@@ -61,31 +61,35 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                           const Block2CTileMap block_2_ctile_map,
                                           const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t c_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+        const long_index_t a_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const long_index_t b_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        const long_index_t c_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
 
-    __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
+        __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+                                                      p_b_grid + b_batch_offset,
+                                                      p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_b_k0_m_k1_grid_desc,
+                                                      b_b_k0_n_k1_grid_desc,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -125,8 +129,8 @@ template <index_t NDimSpatial,
           ck::index_t NPerBlock,
           ck::index_t K0PerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -166,6 +170,9 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                                                  ComputeTypeB>
 {
     using DeviceOp = DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     using ADataType = OutDataType;
     using BDataType = InDataType;
@@ -298,7 +305,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
     using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType,
         BDataType,
@@ -314,11 +322,11 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
         MPerBlock,
         NPerBlock,
         K0PerBlock,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -351,6 +359,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
         PipelineVersion::v1,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     static constexpr auto MakeElementwiseInputSequence()
     {
@@ -539,14 +549,15 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
 
     // Argument
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
+        decltype(GridwiseGemm64::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
 
     using Block2CTileMap =
-        decltype(GridwiseGemm::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1));
+        decltype(GridwiseGemm64::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1));
 
     struct ActiveWorkgroupsPerCU
     {
-        ActiveWorkgroupsPerCU()
+        template <typename GridwiseGemm>
+        int GetMaxOccupancy()
         {
             constexpr int dynamic_smem_size = 0;
             int max_occupancy               = 0;
@@ -568,7 +579,26 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                     true>,
                 BlockSize,
                 dynamic_smem_size));
-            max_occupancy_ = std::max(1, max_occupancy);
+            return std::max(1, max_occupancy);
+        }
+
+        ActiveWorkgroupsPerCU()
+        {
+            max_occupancy_ = 1;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm32>();
+                }
+            }
         }
         int max_occupancy_;
     };
@@ -605,7 +635,6 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
               a_grid_desc_kbatch_k0_m_k1_{},
               b_grid_desc_kbatch_k0_n_k1_{},
               ce_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
               block_2_ctile_map_{},
               compute_ptr_offset_of_batch_{},
               M01_{M01},
@@ -642,6 +671,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                       end(a_g_n_k_wos_lengths),
                       begin(output_spatial_lengths_));
 
+#if !DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
             if(split_k < 0)
             {
                 ck::index_t gemmM, gemmN;
@@ -654,6 +684,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                                                             grid_size);
             }
             else
+#endif
             {
                 k_batch_ = split_k;
             }
@@ -695,7 +726,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                 MakeDsGridDescriptor_M_N<NDimSpatial>(ds_g_k_c_xs_lengths, ds_g_k_c_xs_strides);
 
             block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(ce_grid_desc_m_n_, M01, N01, k_batch_);
+                GridwiseGemm64::MakeCBlockClusterAdaptor(ce_grid_desc_m_n_, M01, N01, k_batch_);
             elementwise_block_2_ctile_map_ = Block2TileMapElementwise{
                 ce_grid_desc_m_n_.GetLength(I0), ce_grid_desc_m_n_.GetLength(I1)};
 
@@ -708,16 +739,6 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                                 end(filter_spatial_lengths_),
                                 index_t{1},
                                 std::multiplies<>{});
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           ce_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ce_grid_desc_m_n_);
-            }
         }
 
         std::size_t GetWorkspaceSizeBytes() const
@@ -733,7 +754,6 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
         AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
         CGridDesc_M_N ce_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
         DsGridDesc_M_N ds_grid_descs_tuple_;
 
         Block2CTileMap block_2_ctile_map_;
@@ -786,7 +806,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                       << arg.ce_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                             arg.b_grid_desc_kbatch_k0_n_k1_,
@@ -797,6 +818,9 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                     "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
             }
 
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.ce_grid_desc_m_n_);
             const auto K0                     = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
 
@@ -843,7 +867,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
                     arg.Conv_G_,
                     arg.a_grid_desc_kbatch_k0_m_k1_,
                     arg.b_grid_desc_kbatch_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
                     arg.block_2_ctile_map_,
                     arg.compute_ptr_offset_of_batch_);
             };
@@ -900,6 +924,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
             return avg_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -915,7 +941,13 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+#if DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
+        if(arg.k_batch_ < 0)
+        {
+            return false;
+        }
+#endif
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
@@ -977,10 +1009,27 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.ce_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                     arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                     arg.ce_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                     arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                     arg.ce_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
index 4565074b3e..e38768b2fa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -57,33 +57,36 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         [[maybe_unused]] const index_t num_k_per_block)
 {
-#if defined(__gfx9__)
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
-    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
+        const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
-                               BGridDesc_BK0_N_K1,
-                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                               HasMainKBlockLoop,
-                               CGlobalMemoryDataOperation,
-                               TailNum>(karg.p_a_grid + a_batch_offset,
-                                        karg.p_b_grid + b_batch_offset,
-                                        karg.p_c_grid + e_batch_offset,
-                                        p_shared,
-                                        karg,
-                                        a_grid_desc_ak0_m_ak1,
-                                        b_grid_desc_bk0_n_bk1,
-                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                        k_idx);
+        GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
+                                   BGridDesc_BK0_N_K1,
+                                   CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                   HasMainKBlockLoop,
+                                   CGlobalMemoryDataOperation,
+                                   TailNum>(karg.p_a_grid + a_batch_offset,
+                                            karg.p_b_grid + b_batch_offset,
+                                            karg.p_c_grid + e_batch_offset,
+                                            p_shared,
+                                            karg,
+                                            a_grid_desc_ak0_m_ak1,
+                                            b_grid_desc_bk0_n_bk1,
+                                            c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                            k_idx);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -112,38 +115,41 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         [[maybe_unused]] const index_t num_k_per_block)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
-    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // offset base pointer for each work-group
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
+        const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
-                                    BGridDesc_BK0_N_K1,
-                                    CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                                    HasMainKBlockLoop,
-                                    CGlobalMemoryDataOperation,
-                                    TailNum>(karg.p_a_grid + a_batch_offset,
-                                             karg.p_b_grid + b_batch_offset,
-                                             karg.p_c_grid + e_batch_offset,
-                                             p_shared_0,
-                                             p_shared_1,
-                                             karg,
-                                             a_grid_desc_ak0_m_ak1,
-                                             b_grid_desc_bk0_n_bk1,
-                                             c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                             k_idx);
+        GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
+                                        BGridDesc_BK0_N_K1,
+                                        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                        HasMainKBlockLoop,
+                                        CGlobalMemoryDataOperation,
+                                        TailNum>(karg.p_a_grid + a_batch_offset,
+                                                 karg.p_b_grid + b_batch_offset,
+                                                 karg.p_c_grid + e_batch_offset,
+                                                 p_shared_0,
+                                                 p_shared_1,
+                                                 karg,
+                                                 a_grid_desc_ak0_m_ak1,
+                                                 b_grid_desc_bk0_n_bk1,
+                                                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                 k_idx);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -166,8 +172,8 @@ template <ck::index_t NDimSpatial,
           ck::index_t NPerBlock,
           ck::index_t KPerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -214,6 +220,9 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
     static_assert(is_same_v<OutElementwiseOperation, element_wise::PassThrough>);
 
     using DeviceOp = DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     using ADataType = OutDataType;
     using BDataType = InDataType;
@@ -393,7 +402,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
     using CElementwiseGridDesc_M_N =
         remove_cvref_t<decltype(GetElementwiseCGridDesc<NDimSpatial>())>;
 
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_conv_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_conv_v3<
         tensor_layout::gemm::RowMajor,
         tensor_layout::gemm::ColumnMajor,
         tensor_layout::gemm::RowMajor,
@@ -412,10 +422,10 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
         KPerBlock,
         K1,
         K1,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -440,6 +450,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
 
@@ -503,12 +515,13 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
 
     // Argument
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             CGridDesc_M_N{}, 1, 1));
 
     struct ActiveWorkgroupsPerCU
     {
-        ActiveWorkgroupsPerCU()
+        template <typename GridwiseGemm>
+        int GetMaxOccupancy()
         {
             constexpr int dynamic_smem_size = 0;
             constexpr index_t minimum_occupancy =
@@ -549,7 +562,26 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                     BlockSize,
                     dynamic_smem_size));
             }
-            max_occupancy_ = std::max(1, max_occupancy);
+            return std::max(1, max_occupancy);
+        }
+
+        ActiveWorkgroupsPerCU()
+        {
+            max_occupancy_ = 1;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm32>();
+                }
+            }
         }
         int max_occupancy_;
     };
@@ -704,10 +736,10 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideC_ = e_g_k_c_xs_strides_transposed[0];
             c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                     ce_grid_desc_m_n_,
-                    GridwiseGemm::CalculateMBlock(GemmM),
-                    GridwiseGemm::CalculateNBlock(GemmN));
+                    GridwiseGemm64::CalculateMBlock(GemmM),
+                    GridwiseGemm64::CalculateNBlock(GemmN));
 
             if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
                          is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
@@ -853,6 +885,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                       << arg.ce_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
+        template <typename GridwiseGemm>
         float RunGemmV3(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const index_t GemmM = arg.a_grid_desc_k0_m_k1_.GetLength(I1);
@@ -1506,7 +1539,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             return ave_time;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float avg_time                 = 0.f;
             auto launch_elementwise_kernel = [&]() {
@@ -1626,11 +1660,13 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                                                    grid_size_a);
             }
 
-            avg_time += RunGemmV3(arg, stream_config);
+            avg_time += RunGemmV3<GridwiseGemm>(arg, stream_config);
             avg_time += launch_elementwise_kernel();
             return avg_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -1651,13 +1687,44 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
         const index_t GemmK =
             arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
 
-        typename GridwiseGemm::Argument gemm_arg{
-            nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
-
-        const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / K1);
-        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        if(get_warp_size() == 64)
         {
-            if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages)
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                typename GridwiseGemm64::Argument gemm_arg{
+                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / K1);
+                if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+                {
+                    if(num_k_loop <= GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages)
+                    {
+                        return false;
+                    }
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                typename GridwiseGemm32::Argument gemm_arg{
+                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / K1);
+                if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+                {
+                    if(num_k_loop <= GridwiseGemm32::BlockwiseGemmPipe::PrefetchStages)
+                    {
+                        return false;
+                    }
+                }
+            }
+            else
             {
                 return false;
             }
@@ -1676,7 +1743,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             }
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 488dadf512..22fc13bae4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -59,28 +59,33 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                           const Block2CTileMap block_2_ctile_map,
                                           const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
-    const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
-    const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+        const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
 
-    __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
+        __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_c_grid + c_batch_offset,
-                                                  p_shared,
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+                                                      p_b_grid + b_batch_offset,
+                                                      p_c_grid + c_batch_offset,
+                                                      p_shared,
+                                                      a_b_k0_m_k1_grid_desc,
+                                                      b_b_k0_n_k1_grid_desc,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -119,8 +124,8 @@ template <ck::index_t NDimSpatial,
           ck::index_t NPerBlock,
           ck::index_t K0PerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -160,6 +165,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                         ComputeTypeB>
 {
     using DeviceOp = DeviceGroupedConvBwdWeight_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     using ADataType = OutDataType;
     using BDataType = InDataType;
@@ -360,7 +368,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                             I1,
                             I0>;
 
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType,
         BDataType,
@@ -376,11 +385,11 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         MPerBlock,
         NPerBlock,
         K0PerBlock,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         K1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -413,17 +422,20 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         PipelineVersion::v1,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
+        decltype(GridwiseGemm64::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
 
     using Block2CTileMap =
-        decltype(GridwiseGemm::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1));
+        decltype(GridwiseGemm64::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1));
 
     struct ActiveWorkgroupsPerCU
     {
-        ActiveWorkgroupsPerCU()
+        template <typename GridwiseGemm>
+        static int GetMaxOccupancy()
         {
             constexpr int dynamic_smem_size = 0;
             int max_occupancy               = 0;
@@ -445,7 +457,25 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     false>, // Both true/false give the same occupancy.
                 BlockSize,
                 dynamic_smem_size));
-            max_occupancy_ = std::max(1, max_occupancy);
+            return std::max(1, max_occupancy);
+        }
+        ActiveWorkgroupsPerCU()
+        {
+            max_occupancy_ = 1;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm32>();
+                }
+            }
         }
         int max_occupancy_;
     };
@@ -477,7 +507,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
               a_grid_desc_kbatch_k0_m_k1_{},
               b_grid_desc_kbatch_k0_n_k1_{},
               c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
               block_2_ctile_map_{},
               compute_ptr_offset_of_batch_{},
               M01_{M01},
@@ -524,6 +553,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(e_g_k_c_xs_lengths,
                                                                     e_g_k_c_xs_strides);
 
+#if !DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
             if(split_k < 0)
             {
                 ck::index_t gemmM, gemmN;
@@ -536,6 +566,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                                             grid_size);
             }
             else
+#endif
             {
                 k_batch_ = split_k;
             }
@@ -563,22 +594,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             c_grid_desc_m_n_            = descs[I2];
 
             block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
+                GridwiseGemm64::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
 
             // A/B/C Batch Stride
             compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideC_ = e_g_k_c_xs_strides_transposed[0];
 
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
-            }
-
             if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
                          is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
@@ -671,7 +693,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         Block2CTileMap block_2_ctile_map_;
 
@@ -731,7 +752,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float avg_time = 0.f;
 
@@ -739,6 +761,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             const BDataType* p_b_grid = arg.p_b_grid_;
             CDataType* p_e_grid       = arg.p_c_grid_;
 
+            auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm64::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.c_grid_desc_m_n_);
+
             if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
                          is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
@@ -845,7 +871,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     arg.Conv_G_,
                     arg.a_grid_desc_kbatch_k0_m_k1_,
                     arg.b_grid_desc_kbatch_k0_n_k1_,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
                     arg.block_2_ctile_map_,
                     arg.compute_ptr_offset_of_batch_);
             };
@@ -893,6 +919,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             return avg_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -908,7 +936,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+#if DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
+        if(arg.k_batch_ < 0)
+        {
+            return false;
+        }
+#endif
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
@@ -1023,10 +1057,27 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                     arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                     arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                     arg.c_grid_desc_m_n_,
+                                                     arg.block_2_ctile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index 0793285dbd..735eebbdf6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -55,32 +55,35 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         const index_t num_k_per_block)
 {
-#if defined(__gfx9__)
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
-    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        const long_index_t a_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const long_index_t b_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        const long_index_t e_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
-                               BGridDesc_BK0_N_K1,
-                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                               HasMainKBlockLoop,
-                               CGlobalMemoryDataOperation,
-                               TailNum>(karg.p_a_grid + a_batch_offset,
-                                        karg.p_b_grid + b_batch_offset,
-                                        karg.p_c_grid + e_batch_offset,
-                                        p_shared,
-                                        karg,
-                                        a_grid_desc_ak0_m_ak1,
-                                        b_grid_desc_bk0_n_bk1,
-                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                        k_idx);
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
+                                   BGridDesc_BK0_N_K1,
+                                   CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                   HasMainKBlockLoop,
+                                   CGlobalMemoryDataOperation,
+                                   TailNum>(karg.p_a_grid + a_batch_offset,
+                                            karg.p_b_grid + b_batch_offset,
+                                            karg.p_c_grid + e_batch_offset,
+                                            p_shared,
+                                            karg,
+                                            a_grid_desc_ak0_m_ak1,
+                                            b_grid_desc_bk0_n_bk1,
+                                            c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                            k_idx);
+    }
 #else
     ignore = karg;
     ignore = a_grid_desc_ak0_m_ak1;
@@ -113,38 +116,41 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         const index_t num_k_per_block)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
-    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // offset base pointer for each work-group
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        const long_index_t a_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const long_index_t b_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        const long_index_t e_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
 
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
-                                    BGridDesc_BK0_N_K1,
-                                    CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                                    HasMainKBlockLoop,
-                                    CGlobalMemoryDataOperation,
-                                    TailNum>(karg.p_a_grid + a_batch_offset,
-                                             karg.p_b_grid + b_batch_offset,
-                                             karg.p_c_grid + e_batch_offset,
-                                             p_shared_0,
-                                             p_shared_1,
-                                             karg,
-                                             a_grid_desc_ak0_m_ak1,
-                                             b_grid_desc_bk0_n_bk1,
-                                             c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                             k_idx);
+        GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
+                                        BGridDesc_BK0_N_K1,
+                                        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                        HasMainKBlockLoop,
+                                        CGlobalMemoryDataOperation,
+                                        TailNum>(karg.p_a_grid + a_batch_offset,
+                                                 karg.p_b_grid + b_batch_offset,
+                                                 karg.p_c_grid + e_batch_offset,
+                                                 p_shared_0,
+                                                 p_shared_1,
+                                                 karg,
+                                                 a_grid_desc_ak0_m_ak1,
+                                                 b_grid_desc_bk0_n_bk1,
+                                                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                 k_idx);
+    }
 #else
     ignore = karg;
     ignore = a_grid_desc_ak0_m_ak1;
@@ -173,8 +179,8 @@ template <ck::index_t NDimSpatial,
           ck::index_t NPerBlock,
           ck::index_t K0PerBlock,
           ck::index_t K1,
-          ck::index_t MPerXdl,
-          ck::index_t NPerXdl,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
           ck::index_t MXdlPerWave,
           ck::index_t NXdlPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -218,6 +224,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
     static_assert(is_same_v<OutElementwiseOperation, element_wise::PassThrough>);
 
     using DeviceOp = DeviceGroupedConvBwdWeight_Xdl_CShuffleV3;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     using ADataType = OutDataType;
     using BDataType = InDataType;
@@ -330,7 +339,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
     using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_conv_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_conv_v3<
         tensor_layout::gemm::RowMajor,
         tensor_layout::gemm::ColumnMajor,
         tensor_layout::gemm::RowMajor,
@@ -349,10 +359,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
         K0PerBlock,
         K1,
         K1,
-        MPerXdl,
-        NPerXdl,
+        MPerXDL,
+        NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -377,15 +387,18 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // Argument
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             CGridDesc_M_N{}, 1, 1));
 
     struct ActiveWorkgroupsPerCU
     {
-        ActiveWorkgroupsPerCU()
+        template <typename GridwiseGemm>
+        static int GetMaxOccupancy()
         {
             constexpr int dynamic_smem_size = 0;
             constexpr index_t minimum_occupancy =
@@ -424,7 +437,26 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                     BlockSize,
                     dynamic_smem_size));
             }
-            max_occupancy_ = std::max(1, max_occupancy);
+            return std::max(1, max_occupancy);
+        }
+
+        ActiveWorkgroupsPerCU()
+        {
+            max_occupancy_ = 1;
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    max_occupancy_ = GetMaxOccupancy<GridwiseGemm32>();
+                }
+            }
         }
         int max_occupancy_;
     };
@@ -492,6 +524,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                       end(a_g_n_k_wos_lengths),
                       begin(output_spatial_lengths_));
 
+#if !DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
             if(split_k < 0)
             {
                 ck::index_t gemmM, gemmN, gemmK;
@@ -517,6 +550,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                 }
             }
             else
+#endif
             {
                 k_batch_ = split_k;
             }
@@ -556,10 +590,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
             const index_t GemmN = b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
 
             c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                     c_grid_desc_m_n_,
-                    GridwiseGemm::CalculateMBlock(GemmM),
-                    GridwiseGemm::CalculateNBlock(GemmN));
+                    GridwiseGemm64::CalculateMBlock(GemmM),
+                    GridwiseGemm64::CalculateNBlock(GemmN));
         }
 
         const ADataType* p_a_grid_;
@@ -617,7 +651,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
             const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
@@ -1225,6 +1260,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -1240,28 +1277,69 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
 
     static bool IsSupportedArgument(const Argument& arg)
     {
+#if DISABLE_SPLIT_K_AUTODEDUCE_FOR_ONE_STAGE_KERNELS
+        if(arg.k_batch_ < 0)
+        {
+            return false;
+        }
+#endif
+
         const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
         const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
         const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
                               arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);
 
-        typename GridwiseGemm::Argument gemm_arg{
-            nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
-
-        const auto num_k_loop = gemm_arg.AK0 / (K0PerBlock / K1);
-        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        if(get_warp_size() == 64)
         {
-            if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages)
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                typename GridwiseGemm64::Argument gemm_arg{
+                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto num_k_loop = gemm_arg.AK0 / (K0PerBlock / K1);
+                if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+                {
+                    if(num_k_loop <= GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages)
+                    {
+                        return false;
+                    }
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                typename GridwiseGemm32::Argument gemm_arg{
+                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto num_k_loop = gemm_arg.AK0 / (K0PerBlock / K1);
+                if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+                {
+                    if(num_k_loop <= GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages)
+                    {
+                        return false;
+                    }
+                }
+            }
+            else
             {
                 return false;
             }
         }
 
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.k_batch_ > 1)
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> &&
            arg.k_batch_ > 1)
         {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 1448914dd3..cc8561a09f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -101,106 +101,111 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputePtrOffsetOfG compute_ptr_offset_of_groups,
         const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if defined(__gfx9__)
-
-    // offset base pointer for each work-group
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
-    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
-
-    const long_index_t e_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
-    const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
-    const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
-
-    const long_index_t e_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
-
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    DsPointer p_ds_grid_grp;
-
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
-
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i]; });
-
-    if constexpr(isMultiA || isMultiB)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        AsPointer p_as_grid_grp;
-        BsPointer p_bs_grid_grp;
+        // offset base pointer for each work-group
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
-        const auto& as_group_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx);
+        const long_index_t e_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
+        const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
+        const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
 
-        // compute_ptr_offset_of_n_ not need BatchStrideB so
-        // in case of MultiA is false but isMultiB is true
-        // BatchStrideA_ is not tuple.
-        if constexpr(isMultiA)
+        const long_index_t e_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        DsPointer p_ds_grid_grp;
+
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
+
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i]; });
+
+        if constexpr(isMultiA || isMultiB)
         {
-            const auto& as_n_offset = compute_ptr_offset_of_n.GetAsPtrOffset(n_idx);
+            AsPointer p_as_grid_grp;
+            BsPointer p_bs_grid_grp;
 
-            static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
-            static_for<0, NumATensor, 1>{}([&](auto i) {
-                p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + as_n_offset[i];
-            });
+            const auto& as_group_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx);
+
+            // compute_ptr_offset_of_n_ not need BatchStrideB so
+            // in case of MultiA is false but isMultiB is true
+            // BatchStrideA_ is not tuple.
+            if constexpr(isMultiA)
+            {
+                const auto& as_n_offset = compute_ptr_offset_of_n.GetAsPtrOffset(n_idx);
+
+                static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
+                static_for<0, NumATensor, 1>{}([&](auto i) {
+                    p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + as_n_offset[i];
+                });
+            }
+            else
+            {
+                const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
+                static_for<0, 1, 1>{}([&](auto i) {
+                    p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + a_n_offset;
+                });
+            }
+
+            const auto& bs_group_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx);
+
+            static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
+            static_for<0, NumBTensor, 1>{}(
+                [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_group_offset[i]; });
+
+            GridwiseGemm::template Run<HasMainKBlockLoop>(
+                p_as_grid_grp,
+                p_bs_grid_grp,
+                p_ds_grid_grp,
+                p_e_grid + e_group_offset + e_n_offset,
+                p_shared,
+                a_element_op,
+                b_element_op,
+                cde_element_op,
+                a_grid_desc_k0_m_k1,
+                b_grid_desc_k0_n_k1,
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                block_2_ctile_map);
         }
         else
         {
-            const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
-            static_for<0, 1, 1>{}(
-                [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + a_n_offset; });
+            const long_index_t b_group_offset =
+                CTranspose
+                    ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx))
+                    : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
+            const long_index_t a_group_offset =
+                CTranspose
+                    ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx))
+                    : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
+            const long_index_t b_n_offset =
+                CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx))
+                           : 0;
+            const long_index_t a_n_offset =
+                CTranspose ? 0
+                           : amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+
+            GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+                p_as_grid + a_group_offset + a_n_offset,
+                p_bs_grid + b_group_offset + b_n_offset,
+                p_ds_grid_grp,
+                p_e_grid + e_group_offset + e_n_offset,
+                p_shared,
+                a_element_op,
+                b_element_op,
+                cde_element_op,
+                a_grid_desc_k0_m_k1,
+                b_grid_desc_k0_n_k1,
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                block_2_ctile_map);
         }
-
-        const auto& bs_group_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx);
-
-        static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
-        static_for<0, NumBTensor, 1>{}(
-            [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_group_offset[i]; });
-
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
-            p_as_grid_grp,
-            p_bs_grid_grp,
-            p_ds_grid_grp,
-            p_e_grid + e_group_offset + e_n_offset,
-            p_shared,
-            a_element_op,
-            b_element_op,
-            cde_element_op,
-            a_grid_desc_k0_m_k1,
-            b_grid_desc_k0_n_k1,
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-            e_grid_desc_mblock_mperblock_nblock_nperblock_,
-            block_2_ctile_map);
-    }
-    else
-    {
-        const long_index_t b_group_offset =
-            CTranspose
-                ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx))
-                : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
-        const long_index_t a_group_offset =
-            CTranspose
-                ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx))
-                : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
-        const long_index_t b_n_offset =
-            CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)) : 0;
-        const long_index_t a_n_offset =
-            CTranspose ? 0 : amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
-
-        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-            p_as_grid + a_group_offset + a_n_offset,
-            p_bs_grid + b_group_offset + b_n_offset,
-            p_ds_grid_grp,
-            p_e_grid + e_group_offset + e_n_offset,
-            p_shared,
-            a_element_op,
-            b_element_op,
-            cde_element_op,
-            a_grid_desc_k0_m_k1,
-            b_grid_desc_k0_n_k1,
-            ds_grid_desc_mblock_mperblock_nblock_nperblock,
-            e_grid_desc_mblock_mperblock_nblock_nperblock_,
-            block_2_ctile_map);
     }
 #else
     ignore = p_as_grid;
@@ -316,6 +321,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                                              BComputeDataType>
 {
     using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static_assert(NumGroupsToMerge >= 1);
 
@@ -478,7 +486,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType,  \
         EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation,       \
         InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \
-        KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave,                        \
+        KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_,                       \
         ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder,  \
         ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim,                               \
         ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false,          \
@@ -495,7 +503,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
         EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation,      \
         NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL,  \
-        NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1,       \
+        NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1,      \
         ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder,                 \
         ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector,                          \
         ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM,                          \
@@ -511,7 +519,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     GemmBDataType, GemmADataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
         EDataType, BElementwiseOperation, AElementwiseOperation, CDEElementwiseOperation,      \
         NumGemmKPrefetchStage, BlockSize, NPerBlock, MPerBlock, KPerBlock, BK1, AK1, NPerXDL,  \
-        MPerXDL, NXdlPerWave, MXdlPerWave, BBlockTransferThreadClusterLengths_BK0_N_BK1,       \
+        MPerXDL, NXdlPerWave_, MXdlPerWave, BBlockTransferThreadClusterLengths_BK0_N_BK1,      \
         BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder,                 \
         BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector,                          \
         BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN,                          \
@@ -524,14 +532,32 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         BComputeDataType, DoElementwiseBeforeCShuffle
 
     // Use appropriate gridwise gemm
-    using GridwiseGemm = std::conditional_t<
-        isMultiA || isMultiB,
-        GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmMultiABDTemplateParameters>,
-        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>>;
-    using GridwiseGemmCTranspose = std::conditional_t<
-        CTranspose,
-        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmCTransposeTemplateParameters>,
-        GridwiseGemm>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmMultipleABDBase =
+        GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmMultiABDTemplateParameters>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmMultipleDBase =
+        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmMultipleDCTransposeBase =
+        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmCTransposeTemplateParameters>;
+
+    using GridwiseGemm64 =
+        std::conditional_t<isMultiA || isMultiB,
+                           GridwiseGemmMultipleABDBase<math::max(NXdlPerWave64, 1)>,
+                           GridwiseGemmMultipleDBase<math::max(NXdlPerWave64, 1)>>;
+    using GridwiseGemm32 = std::conditional_t<isMultiA || isMultiB,
+                                              GridwiseGemmMultipleABDBase<NXdlPerWave32>,
+                                              GridwiseGemmMultipleDBase<NXdlPerWave32>>;
+
+    using GridwiseGemmCTranspose64 =
+        std::conditional_t<CTranspose,
+                           GridwiseGemmMultipleDCTransposeBase<math::max(NXdlPerWave64, 1)>,
+                           GridwiseGemm64>;
+    using GridwiseGemmCTranspose32 =
+        std::conditional_t<CTranspose,
+                           GridwiseGemmMultipleDCTransposeBase<NXdlPerWave32>,
+                           GridwiseGemm32>;
 
     // If ADataTypes or BDataTypes is tuple, user has to pass std::array with pointers.
     using APointers =
@@ -541,27 +567,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     // Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not
     // in initializer list what is required for single const pointer).
     using AGridPointer = remove_cvref_t<
-        decltype(GetAGridPointer < isMultiA || isMultiB, GridwiseGemm, ADataType > ())>;
+        decltype(GetAGridPointer < isMultiA || isMultiB, GridwiseGemm64, ADataType > ())>;
     using BGridPointer = remove_cvref_t<
-        decltype(GetBGridPointer < isMultiA || isMultiB, GridwiseGemm, BDataType > ())>;
+        decltype(GetBGridPointer < isMultiA || isMultiB, GridwiseGemm64, BDataType > ())>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemmCTranspose::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemmCTranspose64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
     using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemmCTranspose::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemmCTranspose64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemmCTranspose::MakeDefaultBlock2ETileMap(
+        remove_cvref_t<decltype(GridwiseGemmCTranspose64::MakeDefaultBlock2ETileMap(
             EGridDesc_M_N{}))>;
     using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<NPerBlock, NPerBlock>;
 
@@ -643,6 +669,62 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm, typename GridwiseGemmCTranspose>
+        void InitGridDesc()
+        {
+            // populate desc for Ds/E
+            if constexpr(isMultiA || isMultiB)
+            {
+                const auto as_grid_desc_ak0_m_ak1 =
+                    generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number<NumATensor>{});
+                const auto bs_grid_desc_bk0_n_bk1 =
+                    generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number<NumBTensor>{});
+
+                if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
+                                               bs_grid_desc_bk0_n_bk1,
+                                               ds_grid_desc_m_n_,
+                                               e_grid_desc_m_n_,
+                                               block_2_etile_map_))
+                {
+                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            e_grid_desc_m_n_);
+
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            ds_grid_desc_m_n_);
+                }
+            }
+            else
+            {
+                bool valid = false;
+                if constexpr(CTranspose)
+                {
+                    valid = GridwiseGemmCTranspose::CheckValidity(b_grid_desc_n_k_,
+                                                                  a_grid_desc_m_k_,
+                                                                  ds_grid_desc_m_n_,
+                                                                  e_grid_desc_m_n_,
+                                                                  block_2_etile_map_);
+                }
+                else
+                {
+                    valid = GridwiseGemmCTranspose::CheckValidity(a_grid_desc_m_k_,
+                                                                  b_grid_desc_n_k_,
+                                                                  ds_grid_desc_m_n_,
+                                                                  e_grid_desc_m_n_,
+                                                                  block_2_etile_map_);
+                }
+                if(valid)
+                {
+                    e_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
+                        MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n_);
+
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
+                        MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n_);
+                }
+            }
+        };
+
         Argument(APointers p_as,
                  BPointers p_bs,
                  const std::array<const void*, NumDTensor>& p_ds,
@@ -709,13 +791,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
               e_grid_desc_m_n_{
                   DeviceOp::MakeEGridDescriptor_M_N<ELayout>(conv_to_gemm_transformer_)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
               block_2_etile_map_{
-                  GridwiseGemmCTranspose::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemmCTranspose64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               compute_ptr_offset_of_groups_{},
               compute_ptr_offset_of_n_{},
               a_element_op_{a_element_op},
@@ -822,58 +904,20 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 e_g_n_k_wos_strides_[0] * NumGroupsToMerge;
             compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides_[1] * conv_N_per_block_;
 
-            // populate desc for Ds/E
-            if constexpr(isMultiA || isMultiB)
+            if(get_warp_size() == 64)
             {
-                const auto as_grid_desc_ak0_m_ak1 =
-                    generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number<NumATensor>{});
-                const auto bs_grid_desc_bk0_n_bk1 =
-                    generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number<NumBTensor>{});
-
-                if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
-                                               bs_grid_desc_bk0_n_bk1,
-                                               ds_grid_desc_m_n_,
-                                               e_grid_desc_m_n_,
-                                               block_2_etile_map_))
+                if constexpr(NXdlPerWave64 > 0)
                 {
-                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            e_grid_desc_m_n_);
-
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            ds_grid_desc_m_n_);
+                    InitGridDesc<GridwiseGemm64, GridwiseGemmCTranspose64>();
                 }
             }
             else
             {
-                bool valid = false;
-                if constexpr(CTranspose)
+                if constexpr(NXdlPerWave32 > 0)
                 {
-                    valid = GridwiseGemmCTranspose::CheckValidity(b_grid_desc_n_k_,
-                                                                  a_grid_desc_m_k_,
-                                                                  ds_grid_desc_m_n_,
-                                                                  e_grid_desc_m_n_,
-                                                                  block_2_etile_map_);
-                }
-                else
-                {
-                    valid = GridwiseGemmCTranspose::CheckValidity(a_grid_desc_m_k_,
-                                                                  b_grid_desc_n_k_,
-                                                                  ds_grid_desc_m_n_,
-                                                                  e_grid_desc_m_n_,
-                                                                  block_2_etile_map_);
-                }
-                if(valid)
-                {
-                    e_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
-                        MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n_);
-
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
-                        MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n_);
+                    InitGridDesc<GridwiseGemm32, GridwiseGemmCTranspose32>();
                 }
             }
-
             if constexpr(NeedTransposeKernel)
             {
                 // Use not modified base strides
@@ -959,18 +1003,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 
         void Print() const
         {
+            std::cout << "AComputeDataType: " << get_type_name<AComputeDataType>()
+                      << "; BComputeDataType: " << get_type_name<BComputeDataType>()
+                      << "; EDataType: " << get_type_name<EDataType>() << std::endl;
+
             std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl;
             std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl;
             static_for<0, NumDTensor, 1>{}(
                 [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
             std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
+
+            std::cout << "a grid desc" << a_grid_desc_ak0_m_ak1_ << std::endl;
+            std::cout << "b grid desc" << b_grid_desc_bk0_n_bk1_ << std::endl;
+            std::cout << "e grid desc" << e_grid_desc_mblock_mperblock_nblock_nperblock_
+                      << std::endl;
         }
 
         //  private:
         // pointers (tuple if multi AB, pointer if no)
         AGridPointer p_as_grid_;
         BGridPointer p_bs_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // for checking IsSupportedArgument()
@@ -1032,6 +1085,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
+        template <typename GridwiseGemm, typename GridwiseGemmCTranspose>
         float RunGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
@@ -1153,7 +1207,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                             isMultiA,
                             isMultiB,
                             CTranspose>;
-
                         return launch_and_time_kernel(
                             stream_config,
                             kernel,
@@ -1232,10 +1285,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             }
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm, typename GridwiseGemmCTranspose>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float avg_time = 0.f;
-
             if constexpr(NeedTransposeKernel)
             {
                 const index_t a_grid_size =
@@ -1285,7 +1338,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                                                    a_grid_size);
             }
 
-            avg_time += RunGemm(arg, stream_config);
+            avg_time += RunGemm<GridwiseGemm, GridwiseGemmCTranspose>(arg, stream_config);
 
             if constexpr(NeedTransposeKernel)
             {
@@ -1324,6 +1377,31 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             return avg_time;
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64, GridwiseGemmCTranspose64>(arg, stream_config);
+                }
+                else
+                {
+                    return 0;
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32, GridwiseGemmCTranspose32>(arg, stream_config);
+                }
+                else
+                {
+                    return 0;
+                }
+            }
+        }
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -1350,7 +1428,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 return false;
             }
         }
-        if(!ck::is_xdl_supported())
+
+        if(!ck::is_xdl_wmma_supported<AComputeDataType, BComputeDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
@@ -1614,40 +1693,125 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         {
             return false;
         }
-
-        // check Gridwise GEMM
-        if constexpr(isMultiA || isMultiB)
+        if constexpr(is_same_v<AComputeDataType, ck::tf32_t> ||
+                     is_same_v<BComputeDataType, ck::tf32_t>)
         {
-            // Genarate tuples with the same descriptors
-            const auto as_grid_desc_ak0_m_ak1 =
-                generate_tuple([&](auto) { return arg.a_grid_desc_m_k_; }, Number<NumATensor>{});
-            const auto bs_grid_desc_bk0_n_bk1 =
-                generate_tuple([&](auto) { return arg.b_grid_desc_n_k_; }, Number<NumBTensor>{});
-            return GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
-                                               bs_grid_desc_bk0_n_bk1,
-                                               arg.ds_grid_desc_m_n_,
-                                               arg.e_grid_desc_m_n_,
-                                               arg.block_2_etile_map_);
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<AComputeDataType, BComputeDataType>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        // check Gridwise GEMM
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                if constexpr(isMultiA || isMultiB)
+                {
+                    // Genarate tuples with the same descriptors
+                    const auto as_grid_desc_ak0_m_ak1 = generate_tuple(
+                        [&](auto) { return arg.a_grid_desc_m_k_; }, Number<NumATensor>{});
+                    const auto bs_grid_desc_bk0_n_bk1 = generate_tuple(
+                        [&](auto) { return arg.b_grid_desc_n_k_; }, Number<NumBTensor>{});
+                    return GridwiseGemm64::CheckValidity(as_grid_desc_ak0_m_ak1,
+                                                         bs_grid_desc_bk0_n_bk1,
+                                                         arg.ds_grid_desc_m_n_,
+                                                         arg.e_grid_desc_m_n_,
+                                                         arg.block_2_etile_map_);
+                }
+                else
+                {
+                    if constexpr(CTranspose)
+                    {
+                        return GridwiseGemmCTranspose64::CheckValidity(arg.b_grid_desc_n_k_,
+                                                                       arg.a_grid_desc_m_k_,
+                                                                       arg.ds_grid_desc_m_n_,
+                                                                       arg.e_grid_desc_m_n_,
+                                                                       arg.block_2_etile_map_);
+                    }
+                    else
+                    {
+                        return GridwiseGemmCTranspose64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                                       arg.b_grid_desc_n_k_,
+                                                                       arg.ds_grid_desc_m_n_,
+                                                                       arg.e_grid_desc_m_n_,
+                                                                       arg.block_2_etile_map_);
+                    }
+                }
+            }
         }
         else
         {
-            if constexpr(CTranspose)
+
+            if constexpr(NXdlPerWave32 > 0)
             {
-                return GridwiseGemmCTranspose::CheckValidity(arg.b_grid_desc_n_k_,
-                                                             arg.a_grid_desc_m_k_,
-                                                             arg.ds_grid_desc_m_n_,
-                                                             arg.e_grid_desc_m_n_,
-                                                             arg.block_2_etile_map_);
-            }
-            else
-            {
-                return GridwiseGemmCTranspose::CheckValidity(arg.a_grid_desc_m_k_,
-                                                             arg.b_grid_desc_n_k_,
-                                                             arg.ds_grid_desc_m_n_,
-                                                             arg.e_grid_desc_m_n_,
-                                                             arg.block_2_etile_map_);
+                if constexpr(isMultiA || isMultiB)
+                {
+                    // Genarate tuples with the same descriptors
+                    const auto as_grid_desc_ak0_m_ak1 = generate_tuple(
+                        [&](auto) { return arg.a_grid_desc_m_k_; }, Number<NumATensor>{});
+                    const auto bs_grid_desc_bk0_n_bk1 = generate_tuple(
+                        [&](auto) { return arg.b_grid_desc_n_k_; }, Number<NumBTensor>{});
+                    return GridwiseGemm32::CheckValidity(as_grid_desc_ak0_m_ak1,
+                                                         bs_grid_desc_bk0_n_bk1,
+                                                         arg.ds_grid_desc_m_n_,
+                                                         arg.e_grid_desc_m_n_,
+                                                         arg.block_2_etile_map_);
+                }
+                else
+                {
+                    if constexpr(CTranspose)
+                    {
+                        return GridwiseGemmCTranspose32::CheckValidity(arg.b_grid_desc_n_k_,
+                                                                       arg.a_grid_desc_m_k_,
+                                                                       arg.ds_grid_desc_m_n_,
+                                                                       arg.e_grid_desc_m_n_,
+                                                                       arg.block_2_etile_map_);
+                    }
+                    else
+                    {
+                        return GridwiseGemmCTranspose32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                                       arg.b_grid_desc_n_k_,
+                                                                       arg.ds_grid_desc_m_n_,
+                                                                       arg.e_grid_desc_m_n_,
+                                                                       arg.block_2_etile_map_);
+                    }
+                }
             }
         }
+
+        if constexpr(is_same_v<AComputeDataType, ck::tf32_t> ||
+                     is_same_v<BComputeDataType, ck::tf32_t>)
+
+        {
+            if(!(ck::get_device_name() == "gfx942"))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "TF32 is enabled on gfx942 only" << std::endl;
+                }
+                return false;
+            }
+            if constexpr(!is_same_v<AComputeDataType, BComputeDataType>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
index bb31d64a93..dd2e429a01 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -82,54 +82,57 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
                                             const ComputePtrOffset compute_ptr_offset_of_groups,
                                             const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
-    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // offset base pointer for each work-group
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
-    const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
-    const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
+        const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
+        const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
 
-    static constexpr index_t NumDTensor = GridwiseGemm::NumDTensor;
-    using DsGridPointer                 = typename GridwiseGemm::DsGridPointer;
-    DsGridPointer p_ds_grid_grp{};
+        static constexpr index_t NumDTensor = GridwiseGemm::NumDTensor;
+        using DsGridPointer                 = typename GridwiseGemm::DsGridPointer;
+        DsGridPointer p_ds_grid_grp{};
 
-    static_for<0, NumDTensor, 1>{}([&](auto i) {
-        p_ds_grid_grp(i) = karg.p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i];
-    });
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            p_ds_grid_grp(i) = karg.p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i];
+        });
 
-    const long_index_t a_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
-    const long_index_t b_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
-    const long_index_t e_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
+        const long_index_t a_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
+        const long_index_t b_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
+        const long_index_t e_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
 
-    const long_index_t a_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
-    const long_index_t e_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+        const long_index_t a_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+        const long_index_t e_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    using Block2CTileMap         = typename GridwiseGemm::Block2CTileMapDefault;
-    const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};
+        using Block2CTileMap         = typename GridwiseGemm::Block2CTileMapDefault;
+        const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_group_offset + a_n_offset,
-        karg.p_b_grid + b_group_offset,
-        p_ds_grid_grp,
-        karg.p_c_grid + e_group_offset + e_n_offset,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op,
-        block_2_ctile_map,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_m_n,
-        c_grid_desc_m_n);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_group_offset + a_n_offset,
+            karg.p_b_grid + b_group_offset,
+            p_ds_grid_grp,
+            karg.p_c_grid + e_group_offset + e_n_offset,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op,
+            block_2_ctile_map,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_m_n,
+            c_grid_desc_m_n);
+    }
 #else
     ignore = karg;
     ignore = a_grid_desc_ak0_m_ak1;
@@ -164,58 +167,61 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         const ComputePtrOffset compute_ptr_offset_of_groups,
         const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if defined(__gfx9__)
-    // offset base pointer for each work-group
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
-    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // offset base pointer for each work-group
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
-    const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
-    const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
+        const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
+        const auto& ds_n_offset     = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
 
-    static constexpr index_t NumDTensor = GridwiseGemm::NumDTensor;
-    using DsGridPointer                 = typename GridwiseGemm::DsGridPointer;
-    DsGridPointer p_ds_grid_grp{};
+        static constexpr index_t NumDTensor = GridwiseGemm::NumDTensor;
+        using DsGridPointer                 = typename GridwiseGemm::DsGridPointer;
+        DsGridPointer p_ds_grid_grp{};
 
-    static_for<0, NumDTensor, 1>{}([&](auto i) {
-        p_ds_grid_grp(i) = karg.p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i];
-    });
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            p_ds_grid_grp(i) = karg.p_ds_grid[i] + ds_n_offset[i] + ds_group_offset[i];
+        });
 
-    const long_index_t a_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
-    const long_index_t b_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
-    const long_index_t e_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
+        const long_index_t a_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
+        const long_index_t b_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
+        const long_index_t e_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
 
-    const long_index_t a_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
-    const long_index_t e_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+        const long_index_t a_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+        const long_index_t e_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
 
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    using Block2CTileMap         = typename GridwiseGemm::Block2CTileMapDefault;
-    const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};
+        using Block2CTileMap         = typename GridwiseGemm::Block2CTileMapDefault;
+        const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_group_offset + a_n_offset,
-        karg.p_b_grid + b_group_offset,
-        p_ds_grid_grp,
-        karg.p_c_grid + e_group_offset + e_n_offset,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op,
-        block_2_ctile_map,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        ds_grid_desc_m_n,
-        c_grid_desc_m_n);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + a_group_offset + a_n_offset,
+            karg.p_b_grid + b_group_offset,
+            p_ds_grid_grp,
+            karg.p_c_grid + e_group_offset + e_n_offset,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op,
+            block_2_ctile_map,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_m_n,
+            c_grid_desc_m_n);
+    }
 #else
     ignore = karg;
     ignore = a_grid_desc_ak0_m_ak1;
@@ -318,6 +324,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                                              BComputeDataType>
 {
     using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr bool isMultiA   = is_detected<is_tuple, ADataType>::value;
     static constexpr bool isMultiB   = is_detected<is_tuple, BDataType>::value;
@@ -469,7 +478,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         remove_cvref_t<decltype(MakeDsGridDescriptor_M_N(dummy_conv_to_gemm_transformer))>;
 
     // Use appropriate gridwise gemm
-    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3<
         tensor_layout::gemm::RowMajor,
         tensor_layout::gemm::ColumnMajor,
         DsLayout,
@@ -493,7 +503,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -521,6 +531,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         ADataType,
         BDataType,
         DoElementwiseBeforeCShuffle>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // #undef GridwiseGemmV3TemplateParams
 
@@ -860,6 +872,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
     {
         using Argument = DeviceOp::Argument;
 
+        template <typename GridwiseGemm>
         float RunGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
@@ -1232,7 +1245,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             return ave_time;
         }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float avg_time = 0.f;
             if constexpr(!isMultiABD)
@@ -1288,7 +1302,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                                                a_grid_size);
                 }
 
-                avg_time += RunGemm(arg, stream_config);
+                avg_time += RunGemm<GridwiseGemm>(arg, stream_config);
 
                 // Transpose result back to NGCHW
                 if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
@@ -1330,6 +1344,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             return avg_time;
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -1373,7 +1389,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             }
         }
 
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<AComputeDataType, BComputeDataType, MPerXDL, NPerXDL>())
         {
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
@@ -1383,7 +1399,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             }
             return false;
         }
-
         // check ConvolutionForwardSpecialization
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
@@ -1628,23 +1643,52 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         const index_t GemmK =
             arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
 
-        typename GridwiseGemm::Argument gemm_arg{nullptr,
-                                                 nullptr,
-                                                 {},
-                                                 nullptr,
-                                                 GemmM,
-                                                 GemmN,
-                                                 GemmK,
-                                                 I0,
-                                                 I0,
-                                                 {},
-                                                 I0,
-                                                 I1 /*KBatch*/,
-                                                 arg.a_element_op_,
-                                                 arg.b_element_op_,
-                                                 arg.cde_element_op_};
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                typename GridwiseGemm64::Argument gemm_arg{nullptr,
+                                                           nullptr,
+                                                           {},
+                                                           nullptr,
+                                                           GemmM,
+                                                           GemmN,
+                                                           GemmK,
+                                                           I0,
+                                                           I0,
+                                                           {},
+                                                           I0,
+                                                           I1 /*KBatch*/,
+                                                           arg.a_element_op_,
+                                                           arg.b_element_op_,
+                                                           arg.cde_element_op_};
+                return GridwiseGemm64::CheckValidity(gemm_arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                typename GridwiseGemm32::Argument gemm_arg{nullptr,
+                                                           nullptr,
+                                                           {},
+                                                           nullptr,
+                                                           GemmM,
+                                                           GemmN,
+                                                           GemmK,
+                                                           I0,
+                                                           I0,
+                                                           {},
+                                                           I0,
+                                                           I1 /*KBatch*/,
+                                                           arg.a_element_op_,
+                                                           arg.b_element_op_,
+                                                           arg.cde_element_op_};
+                return GridwiseGemm32::CheckValidity(gemm_arg);
+            }
+        }
 
-        return GridwiseGemm::CheckValidity(gemm_arg);
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
index d7859dbc46..2e71450ae6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -155,55 +155,60 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const Block2ETileMap block_2_ctile_map,
         const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if defined(__gfx9__)
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
-    const auto rs_batch_offset = compute_ptr_offset_of_batch.GetRsPtrOffset(g_idx);
+        const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        const auto rs_batch_offset = compute_ptr_offset_of_batch.GetRsPtrOffset(g_idx);
 
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    DsPointer p_ds_grid_grp;
+        DsPointer p_ds_grid_grp;
 
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
 
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-    RsPointer p_rs_grid_grp;
+        RsPointer p_rs_grid_grp;
 
-    static constexpr index_t NumRTensor = RsGridDescriptor_MBlock_MPerBlock::Size();
+        static constexpr index_t NumRTensor = RsGridDescriptor_MBlock_MPerBlock::Size();
 
-    static_for<0, NumRTensor, 1>{}(
-        [&](auto i) { p_rs_grid_grp(i) = p_rs_grid[i] + rs_batch_offset[i]; });
+        static_for<0, NumRTensor, 1>{}(
+            [&](auto i) { p_rs_grid_grp(i) = p_rs_grid[i] + rs_batch_offset[i]; });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_ds_grid_grp,
-                                                  p_e_grid + e_batch_offset,
-                                                  p_rs_grid_grp,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  qs_element_op,
-                                                  rs_element_op,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                                  rs_grid_desc_mblock_mperblock,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_ds_grid_grp,
+            p_e_grid + e_batch_offset,
+            p_rs_grid_grp,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            qs_element_op,
+            rs_element_op,
+            a_grid_desc_k0_m_k1,
+            b_grid_desc_k0_n_k1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock_,
+            rs_grid_desc_mblock_mperblock,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -299,6 +304,9 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                                                     QsElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor = DsDataType::Size();
     static constexpr index_t NumRTensor = RsDataType::Size();
@@ -361,11 +369,10 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                      GemmSpec == GemmSpecialization::MNKPadding)
         {
             // pad M
-            return transform_tensor_descriptor(
-                descriptor,
-                make_tuple(make_right_pad_transform(descriptor, MPad)),
-                make_tuple(Sequence<0>{}),
-                make_tuple(Sequence<0>{}));
+            return transform_tensor_descriptor(descriptor,
+                                               make_tuple(make_right_pad_transform(MRaw, MPad)),
+                                               make_tuple(Sequence<0>{}),
+                                               make_tuple(Sequence<0>{}));
         }
         else
         {
@@ -429,7 +436,8 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
     using RGridDesc_M = remove_cvref_t<decltype(MakeRGridDescriptor_M<RLayout>({}, {}))>;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CShuffleDataType,
@@ -459,7 +467,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -482,15 +490,17 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         CDEBlockTransferScalarPerVector_NPerBlock,
         RThreadTransferDstScalarPerVector_MPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
 
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    using Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap;
 
     // Argument
     struct Argument : public BaseArgument
@@ -546,13 +556,10 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
               r_grid_desc_m_{
                   DeviceOp::MakeRGridDescriptor_M<RLayout>(r_g_n_wos_lengths, r_g_n_wos_strides)},
               a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
+                  GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
               b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              rs_grid_desc_mblock_mperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+                  GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
+              block_2_etile_map_{GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               compute_ptr_offset_of_batch_{},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
@@ -577,58 +584,40 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
             compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
             compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
 
-            // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           e_grid_desc_m_n_,
-                                           r_grid_desc_m_,
-                                           block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
+            // populate pointer, batch stride, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
 
-                // populate pointer, batch stride, desc for Ds
-                static_for<0, NumDTensor, 1>{}([&](auto i) {
-                    using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                // D pointer
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
 
-                    // D pointer
-                    p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
+                // D batch stride
+                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0];
 
-                    // D batch stride
-                    compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0];
+                ConvToGemmFwdTransformer conv_to_gemm_transformer_d{a_g_n_c_wis_lengths,
+                                                                    a_g_n_c_wis_strides,
+                                                                    b_g_k_c_xs_lengths,
+                                                                    b_g_k_c_xs_strides,
+                                                                    ds_g_n_k_wos_lengths[i],
+                                                                    ds_g_n_k_wos_strides[i],
+                                                                    conv_filter_strides,
+                                                                    conv_filter_dilations,
+                                                                    input_left_pads,
+                                                                    input_right_pads};
 
-                    ConvToGemmFwdTransformer conv_to_gemm_transformer_d{a_g_n_c_wis_lengths,
-                                                                        a_g_n_c_wis_strides,
-                                                                        b_g_k_c_xs_lengths,
-                                                                        b_g_k_c_xs_strides,
-                                                                        ds_g_n_k_wos_lengths[i],
-                                                                        ds_g_n_k_wos_strides[i],
-                                                                        conv_filter_strides,
-                                                                        conv_filter_dilations,
-                                                                        input_left_pads,
-                                                                        input_right_pads};
+                // D desc
+                ds_grid_desc_m_n_(i) =
+                    DeviceOp::MakeEGridDescriptor_M_N<DELayout>(conv_to_gemm_transformer_d);
+            });
 
-                    // D desc
-                    ds_grid_desc_m_n_(i) =
-                        DeviceOp::MakeEGridDescriptor_M_N<DELayout>(conv_to_gemm_transformer_d);
+            // populate pointer for Rs
+            static_for<0, NumRTensor, 1>{}([&](auto i) {
+                using RDataType = remove_cvref_t<tuple_element_t<i.value, RsDataType>>;
 
-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_(i) =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            ds_grid_desc_m_n_(i));
-                });
-
-                // populate pointer for Rs
-                static_for<0, NumRTensor, 1>{}([&](auto i) {
-                    using RDataType = remove_cvref_t<tuple_element_t<i.value, RsDataType>>;
-
-                    // R pointer
-                    p_rs_grid_(i) = static_cast<RDataType*>(p_rs[i]);
-
-                    rs_grid_desc_mblock_mperblock_(i) =
-                        GridwiseGemm::MakeRGridDescriptor_MBlock_MPerBlock(r_grid_desc_m_);
-                });
-            }
+                // R pointer
+                p_rs_grid_(i)                                  = static_cast<RDataType*>(p_rs[i]);
+                compute_ptr_offset_of_batch_.BatchStrideRs_(i) = r_g_n_wos_strides[0];
+            });
         }
 
         void Print() const
@@ -644,9 +633,9 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
-        typename GridwiseGemm::RsGridPointer p_rs_grid_;
+        typename GridwiseGemm64::RsGridPointer p_rs_grid_;
 
         ConvToGemmFwdTransformer conv_to_gemm_transformer_;
 
@@ -660,16 +649,6 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        StaticallyIndexedArray<
-            typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-            NumDTensor>
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_; // FIXME: Ds desc may be of different
-                                                             // type from E
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
-
-        StaticallyIndexedArray<typename GridwiseGemm::RGridDescriptor_MBlock_MPerBlock, NumRTensor>
-            rs_grid_desc_mblock_mperblock_;
 
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
@@ -703,7 +682,8 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                             arg.b_grid_desc_n_k_,
@@ -715,6 +695,32 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                     "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
             }
 
+            StaticallyIndexedArray<
+                typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                NumDTensor>
+                ds_grid_desc_mblock_mperblock_nblock_nperblock = {};
+
+            StaticallyIndexedArray<typename GridwiseGemm64::RGridDescriptor_MBlock_MPerBlock,
+                                   NumRTensor>
+                rs_grid_desc_mblock_mperblock = {};
+
+            auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    arg.e_grid_desc_m_n_);
+
+            // populate pointer, batch stride, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                ds_grid_desc_mblock_mperblock_nblock_nperblock(i) =
+                    GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        arg.ds_grid_desc_m_n_(i));
+            });
+
+            // populate pointer for Rs
+            static_for<0, NumRTensor, 1>{}([&](auto i) {
+                rs_grid_desc_mblock_mperblock(i) =
+                    GridwiseGemm64::MakeRGridDescriptor_MBlock_MPerBlock(arg.r_grid_desc_m_);
+            });
+
             const index_t grid_size =
                 arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) *
                 arg.a_g_n_c_wis_lengths_[0]; // Group count
@@ -767,9 +773,9 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                                               arg.a_g_n_c_wis_lengths_[0], // Group count
                                               arg.a_grid_desc_ak0_m_ak1_,
                                               arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.rs_grid_desc_mblock_mperblock_,
+                                              ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              rs_grid_desc_mblock_mperblock,
                                               arg.block_2_etile_map_,
                                               arg.compute_ptr_offset_of_batch_);
             };
@@ -784,6 +790,8 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
             }
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -794,7 +802,10 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         namespace ctc = tensor_layout::convolution;
-
+        if(!is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, MPerXDL>())
+        {
+            return false;
+        }
         // check device
         if(get_device_name() == "gfx908")
         {
@@ -812,9 +823,16 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                 return false;
             }
         }
+        else if(ck::is_gfx12_supported() || ck::is_gfx11_supported())
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
+            {
+                return false;
+            }
+        }
         else
         {
-            return false;
+            // return false;
         }
 
         // check ConvolutionForwardSpecialization
@@ -952,11 +970,29 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         }
 
         // check Gridwise GEMM
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.r_grid_desc_m_,
-                                           arg.block_2_etile_map_);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.r_grid_desc_m_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(arg.a_grid_desc_m_k_,
+                                                     arg.b_grid_desc_n_k_,
+                                                     arg.e_grid_desc_m_n_,
+                                                     arg.r_grid_desc_m_,
+                                                     arg.block_2_etile_map_);
+            }
+        }
+        return false;
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
index 8f3feee1c1..25afe46690 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -52,67 +52,70 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputePtrOffset compute_ptr_offset_of_groups,
         const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);
-    const index_t g_idx      = __builtin_amdgcn_readfirstlane(blockIdx.y);
-    const index_t n_idx      = __builtin_amdgcn_readfirstlane(blockIdx.z);
-
-    const long_index_t a_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
-    const long_index_t b_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
-    const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
-    const long_index_t e_group_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
-
-    const long_index_t a_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
-    const auto& ds_n_offset = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
-    const long_index_t e_n_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
-
-    index_t left     = 0;
-    index_t right    = gemms_count;
-    index_t group_id = index_t((left + right) / 2);
-    while((!(block_id_x >= gemm_desc_kernel_args[group_id].BlockStart_ &&
-             block_id_x < gemm_desc_kernel_args[group_id].BlockEnd_)) &&
-          left <= right)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        if(block_id_x < gemm_desc_kernel_args[group_id].BlockStart_)
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t g_idx      = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t n_idx      = __builtin_amdgcn_readfirstlane(blockIdx.z);
+
+        const long_index_t a_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
+        const long_index_t b_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
+        const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
+        const long_index_t e_group_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
+
+        const long_index_t a_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+        const auto& ds_n_offset = compute_ptr_offset_of_n.GetDsPtrOffset(n_idx);
+        const long_index_t e_n_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
+        index_t left     = 0;
+        index_t right    = gemms_count;
+        index_t group_id = index_t((left + right) / 2);
+        while((!(block_id_x >= gemm_desc_kernel_args[group_id].BlockStart_ &&
+                 block_id_x < gemm_desc_kernel_args[group_id].BlockEnd_)) &&
+              left <= right)
         {
-            right = group_id;
+            if(block_id_x < gemm_desc_kernel_args[group_id].BlockStart_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
         }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
+
+        using DsPointer = decltype(gemm_desc_kernel_args[Number<0>{}].ds_ptr_);
+        DsPointer p_ds_grid_grp;
+        static constexpr index_t NumDTensor = DsPointer::Size();
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            p_ds_grid_grp(i) =
+                gemm_desc_kernel_args[group_id].ds_ptr_[i] + ds_group_offset[i] + ds_n_offset[i];
+        });
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            gemm_desc_kernel_args[group_id].a_ptr_ + a_group_offset + a_n_offset,
+            gemm_desc_kernel_args[group_id].b_ptr_ + b_group_offset,
+            p_ds_grid_grp,
+            gemm_desc_kernel_args[group_id].e_ptr_ + e_group_offset + e_n_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            gemm_desc_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
+            gemm_desc_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
+            gemm_desc_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+            gemm_desc_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
+            gemm_desc_kernel_args[group_id].block_2_etile_map_);
     }
-
-    using DsPointer = decltype(gemm_desc_kernel_args[Number<0>{}].ds_ptr_);
-    DsPointer p_ds_grid_grp;
-    static constexpr index_t NumDTensor = DsPointer::Size();
-    static_for<0, NumDTensor, 1>{}([&](auto i) {
-        p_ds_grid_grp(i) =
-            gemm_desc_kernel_args[group_id].ds_ptr_[i] + ds_group_offset[i] + ds_n_offset[i];
-    });
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        gemm_desc_kernel_args[group_id].a_ptr_ + a_group_offset + a_n_offset,
-        gemm_desc_kernel_args[group_id].b_ptr_ + b_group_offset,
-        p_ds_grid_grp,
-        gemm_desc_kernel_args[group_id].e_ptr_ + e_group_offset + e_n_offset,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        c_element_op,
-        gemm_desc_kernel_args[group_id].a_grid_desc_ak0_m_ak1_,
-        gemm_desc_kernel_args[group_id].b_grid_desc_bk0_n_bk1_,
-        gemm_desc_kernel_args[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-        gemm_desc_kernel_args[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
-        gemm_desc_kernel_args[group_id].block_2_etile_map_);
 #else
     ignore = gemm_desc_kernel_args;
     ignore = gemms_count;
@@ -199,6 +202,9 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
                                              BComputeDataType>
 {
     using DeviceOp = DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor  = DsDataType::Size();
     static constexpr index_t MaxGemmsNum = 32;
@@ -412,25 +418,28 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
         CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1,                \
         AComputeDataType, DoElementwiseBeforeCShuffle
     // Use appropriate gridwise gemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>;
+    using GridwiseGemm64   = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32   = GridwiseGemmBase<NXdlPerWave32>;
 
     // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
     // Structure for each gemm(conv)
     struct GemmArgs
     {
@@ -455,6 +464,43 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm, typename DsGridDesc_M_N_, typename EGridDescriptor_M_N_>
+        void init_gemm_args(const ADataType* a_ptr,
+                            const BDataType* b_ptr,
+                            DsPointer ds_ptr,
+                            EDataType* e_ptr,
+                            const AGridDesc_M_K& a_grid_desc_m_k,
+                            const BGridDesc_N_K& b_grid_desc_n_k,
+                            const DsGridDesc_M_N_& ds_grid_desc_m_n,
+                            const EGridDescriptor_M_N_& e_grid_desc_m_n,
+                            const Block2ETileMap& block_2_etile_map,
+                            index_t BlockStart,
+                            index_t BlockEnd)
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
+                                           b_grid_desc_n_k,
+                                           ds_grid_desc_m_n,
+                                           e_grid_desc_m_n,
+                                           block_2_etile_map))
+            {
+                gemm_desc_kernel_args_(valid_gemms_count_) =
+                    GemmArgs{a_ptr,
+                             b_ptr,
+                             ds_ptr,
+                             e_ptr,
+                             GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k),
+                             GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k),
+                             GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                 ds_grid_desc_m_n),
+                             GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                 e_grid_desc_m_n),
+                             block_2_etile_map,
+                             BlockStart,
+                             BlockEnd};
+
+                valid_gemms_count_++;
+            }
+        }
         Argument(const void* p_a,
                  const void* p_b,
                  const std::array<const void*, NumDTensor>& p_ds,
@@ -543,7 +589,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
                         generate_tuple([&](auto) { return e_grid_desc_m_n; }, Number<NumDTensor>{});
 
                     const auto block_2_etile_map =
-                        GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
+                        GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
 
                     const index_t grid_size_grp =
                         block_2_etile_map.CalculateGridSize(e_grid_desc_m_n);
@@ -553,28 +599,39 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
 
                     grid_size_ += grid_size_grp;
 
-                    if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
-                                                   b_grid_desc_n_k,
-                                                   ds_grid_desc_m_n,
-                                                   e_grid_desc_m_n,
-                                                   block_2_etile_map))
+                    if(get_warp_size() == 64)
                     {
-                        gemm_desc_kernel_args_(valid_gemms_count_) = GemmArgs{
-                            a_grid_ptrs[i],
-                            static_cast<const BDataType*>(p_b),
-                            ds_grid_ptrs[i],
-                            c_grid_ptrs[i],
-                            GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k),
-                            GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k),
-                            GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                                ds_grid_desc_m_n),
-                            GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                                e_grid_desc_m_n),
-                            block_2_etile_map,
-                            BlockStart,
-                            BlockEnd};
-
-                        valid_gemms_count_++;
+                        if constexpr(NXdlPerWave64 > 0)
+                        {
+                            init_gemm_args<GridwiseGemm64>(a_grid_ptrs[i],
+                                                           static_cast<const BDataType*>(p_b),
+                                                           ds_grid_ptrs[i],
+                                                           c_grid_ptrs[i],
+                                                           a_grid_desc_m_k,
+                                                           b_grid_desc_n_k,
+                                                           ds_grid_desc_m_n,
+                                                           e_grid_desc_m_n,
+                                                           block_2_etile_map,
+                                                           BlockStart,
+                                                           BlockEnd);
+                        }
+                    }
+                    else
+                    {
+                        if constexpr(NXdlPerWave32 > 0)
+                        {
+                            init_gemm_args<GridwiseGemm32>(a_grid_ptrs[i],
+                                                           static_cast<const BDataType*>(p_b),
+                                                           ds_grid_ptrs[i],
+                                                           c_grid_ptrs[i],
+                                                           a_grid_desc_m_k,
+                                                           b_grid_desc_n_k,
+                                                           ds_grid_desc_m_n,
+                                                           e_grid_desc_m_n,
+                                                           block_2_etile_map,
+                                                           BlockStart,
+                                                           BlockEnd);
+                        }
                     }
                 }
                 // N is the same for all convs
@@ -649,7 +706,10 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const DeviceOp::Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+
+        using Argument = DeviceOp::Argument;
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -703,6 +763,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
             }
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -754,11 +816,10 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
                 return false;
             }
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<AComputeDataType, BComputeDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // check ConvolutionForwardSpecialization
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
index 764daf1750..f6ec0908eb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -45,94 +45,97 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                      const BElementwiseOperation b_element_op,
                                      const CDEElementwiseOperation cde_element_op)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t KBatch = 1;
-
-    const index_t block_id = get_block_1d_id();
-
-    const auto gemm_desc_ptr =
-        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
-
-    const index_t group_id = block_id / grid_size_grp;
-
-    if(group_id >= group_count)
-        return;
-
-    const index_t M = gemm_desc_ptr[group_id].M;
-    const index_t N = gemm_desc_ptr[group_id].N;
-    const index_t K = gemm_desc_ptr[group_id].K;
-
-    if(M == 0 || N == 0 || K == 0)
-        return;
-
-    const auto StrideAs = gemm_desc_ptr[group_id].StrideAs;
-    const auto StrideBs = gemm_desc_ptr[group_id].StrideBs;
-    const auto StrideDs = gemm_desc_ptr[group_id].StrideDs;
-    const auto StrideE  = gemm_desc_ptr[group_id].StrideE;
-
-    const auto e_grid_desc_m_n =
-        GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(M, N, StrideE);
-
-    const index_t BlockStart = group_id * grid_size_grp;
-
-    const auto local_b2e_tile_map = Block2ETileMap{e_grid_desc_m_n, KBatch};
-
-    const auto local_grid_size = local_b2e_tile_map.CalculateGridSize(e_grid_desc_m_n);
-
-    constexpr auto NumATensor = GridwiseGemm::AsGridPointer::Size();
-    constexpr auto NumBTensor = GridwiseGemm::BsGridPointer::Size();
-    constexpr auto NumDTensor = GridwiseGemm::DsGridPointer::Size();
-
-    typename GridwiseGemm::AsGridPointer p_as_grid_;
-    typename GridwiseGemm::BsGridPointer p_bs_grid_;
-    typename GridwiseGemm::DsGridPointer p_ds_grid_;
-
-    static_for<0, NumATensor, 1>{}([&](auto i) {
-        using ADataType = remove_cvref_t<decltype(p_as_grid_(i))>;
-        p_as_grid_(i)   = static_cast<ADataType>(gemm_desc_ptr[group_id].p_as_grid[i]);
-    });
-
-    static_for<0, NumBTensor, 1>{}([&](auto i) {
-        using BDataType = remove_cvref_t<decltype(p_bs_grid_(i))>;
-        p_bs_grid_(i)   = static_cast<BDataType>(gemm_desc_ptr[group_id].p_bs_grid[i]);
-    });
-
-    static_for<0, NumDTensor, 1>{}([&](auto i) {
-        using DDataType = remove_cvref_t<decltype(p_ds_grid_(i))>;
-        p_ds_grid_(i)   = static_cast<DDataType>(gemm_desc_ptr[group_id].p_ds_grid[i]);
-    });
-
-    index_t id_off   = 0;
-    index_t id_local = get_block_1d_id() - BlockStart;
-
-    while(id_local < local_grid_size)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<EGlobalMemoryDataOperation>())
     {
-        const auto block_2_etile_map =
-            GroupedGemmBlock2ETileMap(local_b2e_tile_map, BlockStart, id_off);
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        GridwiseGemm::
-            template Run<HasMainKBlockLoop, GemmSpec, AsLayout, BsLayout, DsLayout, ELayout>(
-                p_as_grid_,
-                p_bs_grid_,
-                p_ds_grid_,
-                gemm_desc_ptr[group_id].p_e_grid,
-                p_shared,
-                a_element_op,
-                b_element_op,
-                cde_element_op,
-                M,
-                N,
-                K,
-                StrideAs,
-                StrideBs,
-                StrideDs,
-                StrideE,
-                block_2_etile_map);
+        const index_t KBatch = 1;
 
-        id_off += grid_size_grp;
-        id_local += grid_size_grp;
+        const index_t block_id = get_block_1d_id();
+
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        const index_t group_id = block_id / grid_size_grp;
+
+        if(group_id >= group_count)
+            return;
+
+        const index_t M = gemm_desc_ptr[group_id].M;
+        const index_t N = gemm_desc_ptr[group_id].N;
+        const index_t K = gemm_desc_ptr[group_id].K;
+
+        if(M == 0 || N == 0 || K == 0)
+            return;
+
+        const auto StrideAs = gemm_desc_ptr[group_id].StrideAs;
+        const auto StrideBs = gemm_desc_ptr[group_id].StrideBs;
+        const auto StrideDs = gemm_desc_ptr[group_id].StrideDs;
+        const auto StrideE  = gemm_desc_ptr[group_id].StrideE;
+
+        const auto e_grid_desc_m_n =
+            GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(M, N, StrideE);
+
+        const index_t BlockStart = group_id * grid_size_grp;
+
+        const auto local_b2e_tile_map = Block2ETileMap{e_grid_desc_m_n, KBatch};
+
+        const auto local_grid_size = local_b2e_tile_map.CalculateGridSize(e_grid_desc_m_n);
+
+        constexpr auto NumATensor = GridwiseGemm::AsGridPointer::Size();
+        constexpr auto NumBTensor = GridwiseGemm::BsGridPointer::Size();
+        constexpr auto NumDTensor = GridwiseGemm::DsGridPointer::Size();
+
+        typename GridwiseGemm::AsGridPointer p_as_grid_;
+        typename GridwiseGemm::BsGridPointer p_bs_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType = remove_cvref_t<decltype(p_as_grid_(i))>;
+            p_as_grid_(i)   = static_cast<ADataType>(gemm_desc_ptr[group_id].p_as_grid[i]);
+        });
+
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType = remove_cvref_t<decltype(p_bs_grid_(i))>;
+            p_bs_grid_(i)   = static_cast<BDataType>(gemm_desc_ptr[group_id].p_bs_grid[i]);
+        });
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DDataType = remove_cvref_t<decltype(p_ds_grid_(i))>;
+            p_ds_grid_(i)   = static_cast<DDataType>(gemm_desc_ptr[group_id].p_ds_grid[i]);
+        });
+
+        index_t id_off   = 0;
+        index_t id_local = get_block_1d_id() - BlockStart;
+
+        while(id_local < local_grid_size)
+        {
+            const auto block_2_etile_map =
+                GroupedGemmBlock2ETileMap(local_b2e_tile_map, BlockStart, id_off);
+
+            GridwiseGemm::
+                template Run<HasMainKBlockLoop, GemmSpec, AsLayout, BsLayout, DsLayout, ELayout>(
+                    p_as_grid_,
+                    p_bs_grid_,
+                    p_ds_grid_,
+                    gemm_desc_ptr[group_id].p_e_grid,
+                    p_shared,
+                    a_element_op,
+                    b_element_op,
+                    cde_element_op,
+                    M,
+                    N,
+                    K,
+                    StrideAs,
+                    StrideBs,
+                    StrideDs,
+                    StrideE,
+                    block_2_etile_map);
+
+            id_off += grid_size_grp;
+            id_local += grid_size_grp;
+        }
     }
 #else
     ignore = gemm_descs_const;
@@ -203,6 +206,9 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
                                               CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumATensor = AsDataType::Size();
     static constexpr index_t NumBTensor = BsDataType::Size();
@@ -215,7 +221,8 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
     static constexpr index_t NumGemmKPrefetchStage = 1;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleABD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleABD_xdl_cshuffle<
         AsDataType,
         BsDataType,
         ComputeType,
@@ -237,7 +244,7 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -259,7 +266,8 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
-
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
     template <typename UnderlyingBlockToCTileMap>
     struct OffsettedBlockToCTileMapMLoops
     {
@@ -508,7 +516,7 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
                     [&](auto j) { StrideDs[j] = gemm_descs[i].stride_Ds_[j]; });
 
                 const auto e_grid_desc_m_n =
-                    GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                    GridwiseGemm64::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
                         AverM, N, StrideE);
 
                 // block-to-e-tile map
@@ -547,7 +555,7 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
             }
 
             const auto e_grid_desc_sum_m_n =
-                GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                GridwiseGemm64::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
                     sum_of_m, gemm_desc_kernel_arg_[0].N_, gemm_desc_kernel_arg_[0].StrideE_);
 
             const auto local_b2c_tile_map = Block2ETileMap{e_grid_desc_sum_m_n, 1};
@@ -581,7 +589,8 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             bool has_main_k_block_loop = true;
 
@@ -667,6 +676,8 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
index 7b5dd55a8f..4188bf537d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -91,6 +91,9 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                                      CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor = DsDataType::Size();
 
@@ -105,7 +108,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
     using WorkspaceDataType = float;
 
     // First stage GridwiseGEMM kernel.
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType,
         BDataType,
@@ -126,7 +130,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         NPerXDL,
         AK1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_KBatch_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -150,7 +154,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         LoopSched,
         PipelineVer,
         ComputeDataType>;
-
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
     template <typename ELay>
     static auto MakeEGridDescriptor_M_N(index_t M, index_t N, index_t StrideE)
     {
@@ -220,8 +225,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             Number<NumDTensor + 1>{});
     }
 
-    using CGridDesc_M_N  = typename GridwiseGemm::CGridDesc_M_N;
-    using EGridDesc_M_N  = typename GridwiseGemm::CGridDesc_M_N;
+    using CGridDesc_M_N  = typename GridwiseGemm64::CGridDesc_M_N;
+    using EGridDesc_M_N  = typename GridwiseGemm64::CGridDesc_M_N;
     using DsGridDesc_M_N = decltype(MakeDsGridDescriptor_M_N({}, {}, {}));
     using DsGridPointer  = decltype(MakeDsGridPointer());
     using CDGridDesc_M_N = decltype(concat_tuple(ck::Tuple<CGridDesc_M_N>{}, DsGridDesc_M_N{}));
@@ -258,7 +263,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
     // Block2CTileMap configuration parameter.
     static constexpr index_t B2E_M01 = 8;
     using GroupedGemmBlock2ETileMap  = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
-    using GemmKernelArgument         = typename GridwiseGemm::Argument;
+    using GemmKernelArgument         = typename GridwiseGemm64::Argument;
 
     struct GemmTransKernelArg
     {
@@ -355,12 +360,13 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                 const index_t stride_b = gemm_descs[i].stride_B_;
                 const index_t stride_e = gemm_descs[i].stride_C_;
 
-                const index_t m_padded  = GridwiseGemm::CalculateMPadded(M);
-                const index_t n_padded  = GridwiseGemm::CalculateNPadded(N);
-                const index_t k_padded  = GridwiseGemm::CalculateKPadded(K, K_BATCH);
-                const index_t k0_padded = GridwiseGemm::CalculateK0Padded(K, K_BATCH);
+                const index_t m_padded  = GridwiseGemm64::CalculateMPadded(M);
+                const index_t n_padded  = GridwiseGemm64::CalculateNPadded(N);
+                const index_t k_padded  = GridwiseGemm64::CalculateKPadded(K, K_BATCH);
+                const index_t k0_padded = GridwiseGemm64::CalculateK0Padded(K, K_BATCH);
 
-                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(M, N, stride_e);
+                const auto c_grid_desc_m_n =
+                    GridwiseGemm64::MakeCGridDescriptor_M_N(M, N, stride_e);
 
                 DsGridDesc_M_N ds_grid_desc_m_n;
                 DsGridPointer p_ds_grid;
@@ -441,11 +447,11 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             {
                 auto& karg = gemm_kernel_args_[i].karg_;
 
-                const index_t k_padded  = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
-                const index_t k0_padded = GridwiseGemm::CalculateK0Padded(karg.K, K_BATCH);
+                const index_t k_padded  = GridwiseGemm64::CalculateKPadded(karg.K, K_BATCH);
+                const index_t k0_padded = GridwiseGemm64::CalculateK0Padded(karg.K, K_BATCH);
 
                 const auto c_grid_desc_m_n =
-                    GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
+                    GridwiseGemm64::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -565,13 +571,14 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         ///
         /// @return     The average kernel execution time (if time measurement is enabled.)
         ///
+        template <typename GridwiseGemm>
         float Run(const Argument& arg,
                   void* dev_gemm_args,
                   void* dev_gemm_workspace,
                   const StreamConfig& stream_config = StreamConfig{})
         {
             auto [all_have_kbatch_gt_one, all_have_main_k_block_loop] =
-                CheckArgument(arg, stream_config);
+                CheckArgument<GridwiseGemm>(arg, stream_config);
 
             if(dev_gemm_args == nullptr)
             {
@@ -593,13 +600,13 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
 
             if(all_have_main_k_block_loop)
             {
-                ave_time =
-                    DispatchKernel<true>(arg, dev_gemm_args, dev_gemm_workspace, stream_config);
+                ave_time = DispatchKernel<GridwiseGemm, true>(
+                    arg, dev_gemm_args, dev_gemm_workspace, stream_config);
             }
             else
             {
-                ave_time =
-                    DispatchKernel<false>(arg, dev_gemm_args, dev_gemm_workspace, stream_config);
+                ave_time = DispatchKernel<GridwiseGemm, false>(
+                    arg, dev_gemm_args, dev_gemm_workspace, stream_config);
             }
 
             return ave_time;
@@ -619,7 +626,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         ///
         /// @return     The average kernel execution time (if time measurement is enabled.)
         ///
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(arg.p_dev_gemm_kargs_ == nullptr)
             {
@@ -637,9 +645,11 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                 throw std::runtime_error(err.str());
             }
 
-            return Run(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config);
+            return Run<GridwiseGemm>(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config);
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -647,6 +657,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         }
 
         private:
+        template <typename GridwiseGemm>
         auto CheckArgument(const Argument& arg, const StreamConfig& stream_config) const
         {
             bool all_have_kbatch_gt_one, all_have_main_k_block_loop;
@@ -670,7 +681,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
 
             for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
             {
-                const auto& gemm_arg = arg.gemm_kernel_args_[i].karg_;
+                const auto& gemm_arg = reinterpret_cast<const typename GridwiseGemm::Argument&>(
+                    arg.gemm_kernel_args_[i].karg_);
                 if(stream_config.log_level_ > 0)
                 {
                     gemm_arg.Print();
@@ -721,7 +733,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             return std::make_tuple(all_have_kbatch_gt_one, all_have_main_k_block_loop);
         }
 
-        template <bool HasMainKBlockLoop>
+        template <typename GridwiseGemm, bool HasMainKBlockLoop>
         float DispatchKernel(const Argument& arg,
                              void* dev_gemm_kargs,
                              void* dev_gemm_workspace,
@@ -818,11 +830,10 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size()) +
             arg.skipped_group_count_) != arg.group_count_)
         {
@@ -836,11 +847,27 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         }
 
         bool supported = true;
+        bool isWave64  = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
         {
             const auto& gemm_arg = arg.gemm_kernel_args_[i].karg_;
+            bool group_arg_valid = false;
+            if(isWave64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    group_arg_valid = GridwiseGemm64::CheckValidity(gemm_arg);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    group_arg_valid = GridwiseGemm32::CheckValidity(
+                        reinterpret_cast<const typename GridwiseGemm32::Argument&>(gemm_arg));
+                }
+            }
 
-            bool group_arg_valid = GridwiseGemm::CheckValidity(gemm_arg);
             if(not group_arg_valid)
             {
                 if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
index 70a395f2f7..d8d688aa06 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -68,126 +68,91 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                        const BElementwiseOperation b_element_op,
                                        const CDEElementwiseOperation cde_element_op)
 {
-#if defined(__gfx9__)
-
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
-    __shared__ uint8_t p_shared[shared_size];
-    __shared__ uint8_t p_shared1[shared_size];
-
-    const auto gemm_desc_ptr =
-        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
-
-    constexpr auto NumDTensor = DsDataType::Size();
-    index_t tile_id           = get_block_1d_id();
-    index_t tile_offset       = 0;
-    index_t group_id          = -1;
-    index_t group_offset      = 0;
-    index_t grid_size_grp     = 0;
-
-    index_t gemm_tile_id_start = 0;
-    index_t gemm_tile_id_end   = 0;
-
-    index_t M = 0, N = 0, K = 0;
-
-    auto b2c_tile_map = OffsettedBlockToCTileMap(LocalBlock2ETileMap(1, 1), 1, 1);
-
-    do
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        // Find corresponding GEMM group for our tile
-        while(!(tile_id >= gemm_tile_id_start && tile_id < gemm_tile_id_end) &&
-              group_id < group_count)
+        constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+        __shared__ uint8_t p_shared[shared_size];
+        __shared__ uint8_t p_shared1[shared_size];
+
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        constexpr auto NumDTensor = DsDataType::Size();
+        index_t tile_id           = get_block_1d_id();
+        index_t tile_offset       = 0;
+        index_t group_id          = -1;
+        index_t group_offset      = 0;
+        index_t grid_size_grp     = 0;
+
+        index_t gemm_tile_id_start = 0;
+        index_t gemm_tile_id_end   = 0;
+
+        index_t M = 0, N = 0, K = 0;
+
+        auto b2c_tile_map = OffsettedBlockToCTileMap(LocalBlock2ETileMap(1, 1), 1, 1);
+
+        do
         {
-            group_offset += grid_size_grp;
-            group_id++;
-
-            if(group_id >= group_count)
-                return;
-
-            M = gemm_desc_ptr[group_id].M;
-            N = gemm_desc_ptr[group_id].N;
-            K = gemm_desc_ptr[group_id].K;
-
-            if(M == 0 || N == 0 || K == 0)
+            // Find corresponding GEMM group for our tile
+            while(!(tile_id >= gemm_tile_id_start && tile_id < gemm_tile_id_end) &&
+                  group_id < group_count)
             {
-                grid_size_grp = 0;
-                continue;
-            }
+                group_offset += grid_size_grp;
+                group_id++;
 
-            b2c_tile_map =
-                OffsettedBlockToCTileMap(LocalBlock2ETileMap(M, N, 4), group_offset, tile_offset);
-            grid_size_grp = b2c_tile_map.CalculateGridSize(M, N);
+                if(group_id >= group_count)
+                    return;
 
-            gemm_tile_id_start = group_offset;
-            gemm_tile_id_end   = group_offset + grid_size_grp;
-        }
+                M = gemm_desc_ptr[group_id].M;
+                N = gemm_desc_ptr[group_id].N;
+                K = gemm_desc_ptr[group_id].K;
 
-        using DsGridPointer = decltype(GridwiseGemm::MakeDsGridPointer());
-        DsGridPointer p_ds_grid;
-
-        static_for<0, NumDTensor, 1>{}([&](auto i) {
-            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
-            p_ds_grid(i)    = static_cast<const DDataType*>(gemm_desc_ptr[group_id].p_ds_grid[i]);
-        });
-
-        static constexpr index_t kbatch  = 1;
-        static constexpr index_t k_grain = kbatch * KPerBlock;
-        index_t K_split                  = (K + k_grain - 1) / k_grain * KPerBlock;
-
-        const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
-
-        // Update tile offset if we have moved within group
-        b2c_tile_map.UpdateTileOffset(tile_offset);
-
-        using Problem = typename GridwiseGemm::Problem;
-        auto problem  = Problem(gemm_desc_ptr[group_id].M,
-                               gemm_desc_ptr[group_id].N,
-                               gemm_desc_ptr[group_id].K,
-                               gemm_desc_ptr[group_id].StrideA,
-                               gemm_desc_ptr[group_id].StrideB,
-                               gemm_desc_ptr[group_id].StrideDs,
-                               gemm_desc_ptr[group_id].StrideE,
-                               kbatch);
-
-        if(has_main_k_block_loop)
-        {
-            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
-                         BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
-            {
-                GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                           true,
-                                           InMemoryDataOperationEnum::Set,
-                                           TailNumber::Full>(
-                    static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                    static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                    p_ds_grid,
-                    static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                    static_cast<void*>(p_shared),
-                    problem,
-                    a_element_op,
-                    b_element_op,
-                    cde_element_op,
-                    b2c_tile_map);
-            }
-            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-            {
-                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                if(M == 0 || N == 0 || K == 0)
                 {
-                    GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                               true,
-                                               InMemoryDataOperationEnum::Set,
-                                               TailNumber::One>(
-                        static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                        static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                        p_ds_grid,
-                        static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                        static_cast<void*>(p_shared),
-                        problem,
-                        a_element_op,
-                        b_element_op,
-                        cde_element_op,
-                        b2c_tile_map);
+                    grid_size_grp = 0;
+                    continue;
                 }
-                else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
+
+                b2c_tile_map = OffsettedBlockToCTileMap(
+                    LocalBlock2ETileMap(M, N, 4), group_offset, tile_offset);
+                grid_size_grp = b2c_tile_map.CalculateGridSize(M, N);
+
+                gemm_tile_id_start = group_offset;
+                gemm_tile_id_end   = group_offset + grid_size_grp;
+            }
+
+            using DsGridPointer = decltype(GridwiseGemm::MakeDsGridPointer());
+            DsGridPointer p_ds_grid;
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                p_ds_grid(i) = static_cast<const DDataType*>(gemm_desc_ptr[group_id].p_ds_grid[i]);
+            });
+
+            static constexpr index_t kbatch  = 1;
+            static constexpr index_t k_grain = kbatch * KPerBlock;
+            index_t K_split                  = (K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            // Update tile offset if we have moved within group
+            b2c_tile_map.UpdateTileOffset(tile_offset);
+
+            using Problem = typename GridwiseGemm::Problem;
+            auto problem  = Problem(gemm_desc_ptr[group_id].M,
+                                   gemm_desc_ptr[group_id].N,
+                                   gemm_desc_ptr[group_id].K,
+                                   gemm_desc_ptr[group_id].StrideA,
+                                   gemm_desc_ptr[group_id].StrideB,
+                                   gemm_desc_ptr[group_id].StrideDs,
+                                   gemm_desc_ptr[group_id].StrideE,
+                                   kbatch);
+
+            if(has_main_k_block_loop)
+            {
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                 {
                     GridwiseGemm::template Run<OffsettedBlockToCTileMap,
                                                true,
@@ -204,15 +169,14 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                         cde_element_op,
                         b2c_tile_map);
                 }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
                 {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
                     {
                         GridwiseGemm::template Run<OffsettedBlockToCTileMap,
                                                    true,
                                                    InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Two>(
+                                                   TailNumber::One>(
                             static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
                             static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
                             p_ds_grid,
@@ -224,16 +188,12 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             cde_element_op,
                             b2c_tile_map);
                     }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three)
+                    else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
                     {
                         GridwiseGemm::template Run<OffsettedBlockToCTileMap,
                                                    true,
                                                    InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Three>(
+                                                   TailNumber::Full>(
                             static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
                             static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
                             p_ds_grid,
@@ -245,84 +205,166 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             cde_element_op,
                             b2c_tile_map);
                     }
-                }
 
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four)
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
                     {
-                        GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                                   true,
-                                                   InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Four>(
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Two>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Three>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Four>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Five>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Six>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+
+                    if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven)
+                        {
+                            GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set,
+                                                       TailNumber::Seven>(
+                                static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
+                                static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
+                                p_ds_grid,
+                                static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
+                                static_cast<void*>(p_shared),
+                                problem,
+                                a_element_op,
+                                b_element_op,
+                                cde_element_op,
+                                b2c_tile_map);
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    {
+                        GridwiseGemm::template Run_2Lds<OffsettedBlockToCTileMap,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        TailNumber::Odd>(
                             static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
                             static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
                             p_ds_grid,
                             static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
                             static_cast<void*>(p_shared),
+                            static_cast<void*>(p_shared1),
                             problem,
                             a_element_op,
                             b_element_op,
                             cde_element_op,
                             b2c_tile_map);
                     }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five)
+                    else
                     {
-                        GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                                   true,
-                                                   InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Five>(
-                            static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                            static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                            p_ds_grid,
-                            static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                            static_cast<void*>(p_shared),
-                            problem,
-                            a_element_op,
-                            b_element_op,
-                            cde_element_op,
-                            b2c_tile_map);
-                    }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
-                    {
-                        GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                                   true,
-                                                   InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Six>(
-                            static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                            static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                            p_ds_grid,
-                            static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                            static_cast<void*>(p_shared),
-                            problem,
-                            a_element_op,
-                            b_element_op,
-                            cde_element_op,
-                            b2c_tile_map);
-                    }
-                }
-
-                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven)
-                    {
-                        GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                                   true,
-                                                   InMemoryDataOperationEnum::Set,
-                                                   TailNumber::Seven>(
+                        GridwiseGemm::template Run_2Lds<OffsettedBlockToCTileMap,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        TailNumber::Even>(
                             static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
                             static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
                             p_ds_grid,
                             static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
                             static_cast<void*>(p_shared),
+                            static_cast<void*>(p_shared1),
                             problem,
                             a_element_op,
                             b_element_op,
@@ -331,39 +373,19 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                     }
                 }
             }
-            // Tail number could be Odd or Even
-            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+            else
             {
-                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
                 {
-                    GridwiseGemm::template Run_2Lds<OffsettedBlockToCTileMap,
-                                                    true,
-                                                    InMemoryDataOperationEnum::Set,
-                                                    TailNumber::Odd>(
+                    GridwiseGemm::template Run<OffsettedBlockToCTileMap,
+                                               false,
+                                               InMemoryDataOperationEnum::Set,
+                                               TailNumber::Full>(
                         static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
                         static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
                         p_ds_grid,
                         static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
                         static_cast<void*>(p_shared),
-                        static_cast<void*>(p_shared1),
-                        problem,
-                        a_element_op,
-                        b_element_op,
-                        cde_element_op,
-                        b2c_tile_map);
-                }
-                else
-                {
-                    GridwiseGemm::template Run_2Lds<OffsettedBlockToCTileMap,
-                                                    true,
-                                                    InMemoryDataOperationEnum::Set,
-                                                    TailNumber::Even>(
-                        static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                        static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                        p_ds_grid,
-                        static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                        static_cast<void*>(p_shared),
-                        static_cast<void*>(p_shared1),
                         problem,
                         a_element_op,
                         b_element_op,
@@ -371,32 +393,12 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                         b2c_tile_map);
                 }
             }
-        }
-        else
-        {
-            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
-            {
-                GridwiseGemm::template Run<OffsettedBlockToCTileMap,
-                                           false,
-                                           InMemoryDataOperationEnum::Set,
-                                           TailNumber::Full>(
-                    static_cast<const ADataType*>(gemm_desc_ptr[group_id].p_a_grid),
-                    static_cast<const BDataType*>(gemm_desc_ptr[group_id].p_b_grid),
-                    p_ds_grid,
-                    static_cast<EDataType*>(gemm_desc_ptr[group_id].p_e_grid),
-                    static_cast<void*>(p_shared),
-                    problem,
-                    a_element_op,
-                    b_element_op,
-                    cde_element_op,
-                    b2c_tile_map);
-            }
-        }
 
-        tile_id += get_grid_size();
-        tile_offset += get_grid_size();
+            tile_id += get_grid_size();
+            tile_offset += get_grid_size();
 
-    } while(group_id < group_count);
+        } while(group_id < group_count);
+    }
 #else
     ignore = gemm_descs_const;
     ignore = group_count;
@@ -467,10 +469,14 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                                        BElementwiseOperation,
                                        CDEElementwiseOperation>
 {
-    using DeviceOp                      = DeviceGroupedGemmMultipleDXdlCShuffleTileLoop;
+    using DeviceOp = DeviceGroupedGemmMultipleDXdlCShuffleTileLoop;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
-    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         DsLayout,
@@ -494,7 +500,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -519,6 +525,8 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         BlkGemmPipelineVer,
         ComputeTypeA,
         ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using KernelArguments = GroupedGemmKernelArgument<NumDTensor>;
     using Block2ETileMap  = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
@@ -571,10 +579,15 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         // The oversubscription factor for the number of blocks that can simultaneously reside on
         // GPU.
         static constexpr int BLOCK_SUBSCRIPTION_FACTOR = 1;
-        static constexpr int BLOCK_WAVES               = BlockSize / get_warp_size();
-        static constexpr int CU_SIMDS                  = 4;
+        // static constexpr int BLOCK_WAVES               = BlockSize / get_warp_size();
+        static constexpr int CU_SIMDS = 4;
         // Assume we want to have at most 2 waves per SIMD
-        static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
+        // static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
+        static int GetCuBlocks()
+        {
+            int BLOCK_WAVES = BlockSize / get_warp_size();
+            return math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
+        }
     };
 
     // Invoker
@@ -593,6 +606,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         ///
         /// @return     The average kernel execution time (if time measurement is enabled.)
         ///
+        template <typename GridwiseGemm>
         float Run(const Argument& arg,
                   const void* dev_gemm_args,
                   const StreamConfig& stream_config = StreamConfig{})
@@ -606,7 +620,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
             }
 
             float ave_time = 0;
-            ave_time       = DispatchKernel(arg, dev_gemm_args, stream_config);
+            ave_time       = DispatchKernel<GridwiseGemm>(arg, dev_gemm_args, stream_config);
 
             return ave_time;
         }
@@ -624,7 +638,8 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         ///
         /// @return     The average kernel execution time (if time measurement is enabled.)
         ///
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(arg.p_dev_gemm_args_ == nullptr)
             {
@@ -634,9 +649,11 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                 throw std::runtime_error(err.str());
             }
 
-            return Run(arg, arg.p_dev_gemm_args_, stream_config);
+            return Run<GridwiseGemm>(arg, arg.p_dev_gemm_args_, stream_config);
         }
 
+        INVOKER_RUN_IMPL
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -644,6 +661,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         }
 
         private:
+        template <typename GridwiseGemm>
         float DispatchKernel(const Argument& arg,
                              const void* dev_gemm_args,
                              const StreamConfig& stream_config) const
@@ -686,11 +704,11 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
             {
                 std::cout << "MaxActiveBlocksPerCU: " << occ_num_blocks
                           << ", available CUs count: " << cu_count << ", occup. grid size: "
-                          << ck::math::min(occ_num_blocks, KernelConfig::CU_BLOCKS) * cu_count
+                          << ck::math::min(occ_num_blocks, KernelConfig::GetCuBlocks()) * cu_count
                           << std::endl;
             }
 
-            return cu_count * ck::math::min(occ_num_blocks, KernelConfig::CU_BLOCKS);
+            return cu_count * ck::math::min(occ_num_blocks, KernelConfig::GetCuBlocks());
         }
 
         template <typename KernelFunction>
@@ -730,36 +748,19 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         bool supported = true;
 
         constexpr index_t k_batch = 1;
+        bool isWave64             = get_warp_size() == 64;
         for(index_t i = 0; i < arg.group_count_; ++i)
         {
             std::array<const void*, NumDTensor> placeholder_p_ds_grid{};
             std::array<index_t, NumDTensor> stride_Ds;
             std::copy_n(arg.gemm_descs_[i].stride_Ds_.begin(), NumDTensor, stride_Ds.begin());
-            using GridArg = typename GridwiseGemm::Argument;
-            GridArg gridwise_arg(nullptr,               // p_a_grid,
-                                 nullptr,               // p_b_grid,
-                                 placeholder_p_ds_grid, // p_ds_grid,
-                                 nullptr,               // p_e_grid  ,
-                                 arg.gemm_descs_[i].M_,
-                                 arg.gemm_descs_[i].N_,
-                                 arg.gemm_descs_[i].K_,
-                                 arg.gemm_descs_[i].stride_A_,
-                                 arg.gemm_descs_[i].stride_B_,
-                                 stride_Ds,
-                                 arg.gemm_descs_[i].stride_C_,
-                                 k_batch,
-                                 arg.a_element_op_,
-                                 arg.b_element_op_,
-                                 arg.cde_element_op_);
-
             if((arg.gemm_descs_[i].K_ % AK1 != 0 || arg.gemm_descs_[i].K_ % BK1 != 0) &&
                !(GemmSpec == GemmSpecialization::MKPadding ||
                  GemmSpec == GemmSpecialization::NKPadding ||
@@ -768,8 +769,62 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
             {
                 return false;
             }
+            if(isWave64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    using GridArg = typename GridwiseGemm64::Argument;
+                    GridArg gridwise_arg(nullptr,               // p_a_grid,
+                                         nullptr,               // p_b_grid,
+                                         placeholder_p_ds_grid, // p_ds_grid,
+                                         nullptr,               // p_e_grid  ,
+                                         arg.gemm_descs_[i].M_,
+                                         arg.gemm_descs_[i].N_,
+                                         arg.gemm_descs_[i].K_,
+                                         arg.gemm_descs_[i].stride_A_,
+                                         arg.gemm_descs_[i].stride_B_,
+                                         stride_Ds,
+                                         arg.gemm_descs_[i].stride_C_,
+                                         k_batch,
+                                         arg.a_element_op_,
+                                         arg.b_element_op_,
+                                         arg.cde_element_op_);
 
-            supported = supported && GridwiseGemm::CheckValidity(gridwise_arg);
+                    supported = supported && GridwiseGemm64::CheckValidity(gridwise_arg);
+                }
+                else
+                {
+                    supported = false;
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    using GridArg = typename GridwiseGemm32::Argument;
+                    GridArg gridwise_arg(nullptr,               // p_a_grid,
+                                         nullptr,               // p_b_grid,
+                                         placeholder_p_ds_grid, // p_ds_grid,
+                                         nullptr,               // p_e_grid  ,
+                                         arg.gemm_descs_[i].M_,
+                                         arg.gemm_descs_[i].N_,
+                                         arg.gemm_descs_[i].K_,
+                                         arg.gemm_descs_[i].stride_A_,
+                                         arg.gemm_descs_[i].stride_B_,
+                                         stride_Ds,
+                                         arg.gemm_descs_[i].stride_C_,
+                                         k_batch,
+                                         arg.a_element_op_,
+                                         arg.b_element_op_,
+                                         arg.cde_element_op_);
+
+                    supported = supported && GridwiseGemm32::CheckValidity(gridwise_arg);
+                }
+                else
+                {
+                    supported = false;
+                }
+            }
         }
 
         return supported;
@@ -780,6 +835,67 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }
 
+    static int GetKernelOccupancy()
+    {
+        int occupancy = 0;
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                const auto kernel = kernel_grouped_gemm_multiple_d_xdl<GridwiseGemm64,
+                                                                       KernelArguments,
+                                                                       GemmSpec,
+                                                                       ADataType,
+                                                                       BDataType,
+                                                                       DsDataType,
+                                                                       EDataType,
+                                                                       ALayout,
+                                                                       BLayout,
+                                                                       DsLayout,
+                                                                       ELayout,
+                                                                       KPerBlock,
+                                                                       OffsettedLocalBlock2ETileMap,
+                                                                       Block2ETileMap,
+                                                                       AElementwiseOperation,
+                                                                       BElementwiseOperation,
+                                                                       CDEElementwiseOperation,
+                                                                       BlkGemmPipeSched,
+                                                                       BlkGemmPipelineVer>;
+                hip_check_error(
+                    hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
+            }
+        }
+        else
+        {
+
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                const auto kernel = kernel_grouped_gemm_multiple_d_xdl<GridwiseGemm32,
+                                                                       KernelArguments,
+                                                                       GemmSpec,
+                                                                       ADataType,
+                                                                       BDataType,
+                                                                       DsDataType,
+                                                                       EDataType,
+                                                                       ALayout,
+                                                                       BLayout,
+                                                                       DsLayout,
+                                                                       ELayout,
+                                                                       KPerBlock,
+                                                                       OffsettedLocalBlock2ETileMap,
+                                                                       Block2ETileMap,
+                                                                       AElementwiseOperation,
+                                                                       BElementwiseOperation,
+                                                                       CDEElementwiseOperation,
+                                                                       BlkGemmPipeSched,
+                                                                       BlkGemmPipelineVer>;
+                hip_check_error(
+                    hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
+            }
+        }
+        return occupancy;
+    }
+
     static auto MakeArgument(std::vector<const void*>& p_As,
                              std::vector<const void*>& p_Bs,
                              std::vector<std::array<const void*, NumDTensor>>& p_Ds,
@@ -789,28 +905,8 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                              BElementwiseOperation b_elementwise_op,
                              CDEElementwiseOperation cde_elementwise_op)
     {
-        const auto kernel = kernel_grouped_gemm_multiple_d_xdl<GridwiseGemm,
-                                                               KernelArguments,
-                                                               GemmSpec,
-                                                               ADataType,
-                                                               BDataType,
-                                                               DsDataType,
-                                                               EDataType,
-                                                               ALayout,
-                                                               BLayout,
-                                                               DsLayout,
-                                                               ELayout,
-                                                               KPerBlock,
-                                                               OffsettedLocalBlock2ETileMap,
-                                                               Block2ETileMap,
-                                                               AElementwiseOperation,
-                                                               BElementwiseOperation,
-                                                               CDEElementwiseOperation,
-                                                               BlkGemmPipeSched,
-                                                               BlkGemmPipelineVer>;
-        int occupancy, num_cu;
-        hip_check_error(
-            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
+        int occupancy = GetKernelOccupancy();
+        int num_cu;
 
         hipDeviceProp_t dev_prop;
         hipDevice_t dev;
@@ -840,28 +936,8 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                         BElementwiseOperation b_elementwise_op,
                         CDEElementwiseOperation cde_elementwise_op) override
     {
-        const auto kernel = kernel_grouped_gemm_multiple_d_xdl<GridwiseGemm,
-                                                               KernelArguments,
-                                                               GemmSpec,
-                                                               ADataType,
-                                                               BDataType,
-                                                               DsDataType,
-                                                               EDataType,
-                                                               ALayout,
-                                                               BLayout,
-                                                               DsLayout,
-                                                               ELayout,
-                                                               KPerBlock,
-                                                               OffsettedLocalBlock2ETileMap,
-                                                               Block2ETileMap,
-                                                               AElementwiseOperation,
-                                                               BElementwiseOperation,
-                                                               CDEElementwiseOperation,
-                                                               BlkGemmPipeSched,
-                                                               BlkGemmPipelineVer>;
-        int occupancy, num_cu;
-        hip_check_error(
-            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
+        int occupancy = GetKernelOccupancy();
+        int num_cu;
 
         hipDeviceProp_t dev_prop;
         hipDevice_t dev;
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 784b2fd401..62dcbfb83b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -43,63 +43,70 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const B1ElementwiseOperation b1_element_op,
         const CElementwiseOperation c_element_op)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t block_id = get_block_1d_id();
-
-    const auto arg_ptr = reinterpret_cast<const GroupKernelArg*>(
-        cast_pointer_to_generic_address_space(group_kernel_args));
-
-    index_t left     = 0;
-    index_t right    = group_count;
-    index_t group_id = index_t((left + right) / 2);
-
-    while(
-        (!(block_id >= arg_ptr[group_id].block_start_ && block_id < arg_ptr[group_id].block_end_)))
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        if(block_id < arg_ptr[group_id].block_start_)
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        const index_t block_id = get_block_1d_id();
+
+        const auto arg_ptr = reinterpret_cast<const GroupKernelArg*>(
+            cast_pointer_to_generic_address_space(group_kernel_args));
+
+        index_t left     = 0;
+        index_t right    = group_count;
+        index_t group_id = index_t((left + right) / 2);
+
+        while((!(block_id >= arg_ptr[group_id].block_start_ &&
+                 block_id < arg_ptr[group_id].block_end_)))
         {
-            right = group_id;
+            if(block_id < arg_ptr[group_id].block_start_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
         }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
+
+        // per-group batch offset
+        const index_t num_blocks_per_batch = arg_ptr[group_id].num_blocks_per_batch_;
+        const index_t g_idx                = __builtin_amdgcn_readfirstlane(
+            (block_id - arg_ptr[group_id].block_start_) / num_blocks_per_batch);
+
+        const long_index_t a_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
+                arg_ptr[group_id].compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
+        const long_index_t b_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
+                arg_ptr[group_id].compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
+        const long_index_t b1_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
+                arg_ptr[group_id].compute_base_ptr_of_batch_.GetB1BasePtr(g_idx)));
+        const long_index_t c_batch_offset =
+            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
+                arg_ptr[group_id].compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
+
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            arg_ptr[group_id].p_a_grid_ + a_batch_offset,
+            arg_ptr[group_id].p_b_grid_ + b_batch_offset,
+            arg_ptr[group_id].p_b1_grid_ + b1_batch_offset,
+            arg_ptr[group_id].p_c_grid_ + c_batch_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            acc_element_op,
+            b1_element_op,
+            c_element_op,
+            arg_ptr[group_id].a_grid_desc_ak0_m_ak1_,
+            arg_ptr[group_id].b_grid_desc_bk0_n_bk1_,
+            arg_ptr[group_id].b1_grid_desc_bk0_n_bk1_,
+            arg_ptr[group_id].c_grid_desc_mblock_mperblock_nblock_nperblock_,
+            arg_ptr[group_id].block_2_ctile_map_,
+            arg_ptr[group_id].c0_matrix_mask_);
     }
-
-    // per-group batch offset
-    const index_t num_blocks_per_batch = arg_ptr[group_id].num_blocks_per_batch_;
-    const index_t g_idx                = __builtin_amdgcn_readfirstlane(
-        (block_id - arg_ptr[group_id].block_start_) / num_blocks_per_batch);
-
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(arg_ptr[group_id].compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(arg_ptr[group_id].compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
-    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
-        arg_ptr[group_id].compute_base_ptr_of_batch_.GetB1BasePtr(g_idx)));
-    const long_index_t c_batch_offset  = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(arg_ptr[group_id].compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        arg_ptr[group_id].p_a_grid_ + a_batch_offset,
-        arg_ptr[group_id].p_b_grid_ + b_batch_offset,
-        arg_ptr[group_id].p_b1_grid_ + b1_batch_offset,
-        arg_ptr[group_id].p_c_grid_ + c_batch_offset,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        acc_element_op,
-        b1_element_op,
-        c_element_op,
-        arg_ptr[group_id].a_grid_desc_ak0_m_ak1_,
-        arg_ptr[group_id].b_grid_desc_bk0_n_bk1_,
-        arg_ptr[group_id].b1_grid_desc_bk0_n_bk1_,
-        arg_ptr[group_id].c_grid_desc_mblock_mperblock_nblock_nperblock_,
-        arg_ptr[group_id].block_2_ctile_map_,
-        arg_ptr[group_id].c0_matrix_mask_);
 #else
     ignore = group_kernel_args;
     ignore = group_count;
@@ -198,6 +205,11 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                                                  CElementwiseOperation,
                                                  MaskingSpec>
 {
+    static constexpr auto MXdlPerWave64 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+    static constexpr auto MXdlPerWave32 =
+        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
+
     static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0,
                   "Number of dimension must be greater than 0");
 
@@ -338,7 +350,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
     };
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+    template <index_t MXdlPerWave_>
+    using GridwiseGemmBase = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -365,7 +378,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
         B1K1,
         MPerXDL,
         NPerXDL,
-        MXdlPerWave,
+        MXdlPerWave_,
         NXdlPerWave,
         Gemm1NXdlPerWave,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -399,8 +412,10 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
         LoopSched,
         Transform::matrix_padder.PadN,
         MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(MXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<MXdlPerWave32>;
 
-    using Block2CTileMap = OffsettedBlockToCTileMap<typename GridwiseGemm::DefaultBlock2CTileMap>;
+    using Block2CTileMap = OffsettedBlockToCTileMap<typename GridwiseGemm64::DefaultBlock2CTileMap>;
 
     struct GroupKernelArg
     {
@@ -414,7 +429,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
         B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemm64::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             c_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         // batch & stride
@@ -511,7 +526,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                     problem_desc.c_gs_ms_os_lengths, problem_desc.c_gs_ms_os_strides);
 
                 const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    GridwiseGemm64::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                         c_grid_desc_m_n);
 
                 const index_t BlockStart     = grid_size_;
@@ -592,7 +607,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!DeviceOp::IsSupportedArgument(arg))
             {
@@ -664,6 +680,25 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
             return ave_time;
         }
 
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -680,11 +715,10 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         // TODO ANT: Check if tensor specialization & strides mismatch
 
         bool all_has_main_k_block_loop  = true;
@@ -708,7 +742,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
             // Check if having main loop
             const auto K = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) *
                            kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
-            const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K);
+            const bool y = GridwiseGemm64::CalculateHasMainKBlockLoop(K);
             all_has_main_k_block_loop &= y;
             some_has_main_k_block_loop |= y;
 
@@ -753,14 +787,31 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 return false;
             }
 
-            if(!GridwiseGemm::CheckValidity(kernel_arg.a_grid_desc_ak0_m_ak1_,
-                                            kernel_arg.b_grid_desc_bk0_n_bk1_,
-                                            kernel_arg.b1_grid_desc_bk0_n_bk1_,
-                                            device_arg.c_grid_desc_m_n_,
-                                            kernel_arg.block_2_ctile_map_))
+            bool valid = false;
+            if(get_warp_size() == 64)
             {
-                return false;
+                if constexpr(MXdlPerWave64 > 0)
+                {
+                    valid = GridwiseGemm64::CheckValidity(kernel_arg.a_grid_desc_ak0_m_ak1_,
+                                                          kernel_arg.b_grid_desc_bk0_n_bk1_,
+                                                          kernel_arg.b1_grid_desc_bk0_n_bk1_,
+                                                          device_arg.c_grid_desc_m_n_,
+                                                          kernel_arg.block_2_ctile_map_);
+                }
             }
+            else
+            {
+                if constexpr(MXdlPerWave32 > 0)
+                {
+                    valid = GridwiseGemm32::CheckValidity(kernel_arg.a_grid_desc_ak0_m_ak1_,
+                                                          kernel_arg.b_grid_desc_bk0_n_bk1_,
+                                                          kernel_arg.b1_grid_desc_bk0_n_bk1_,
+                                                          device_arg.c_grid_desc_m_n_,
+                                                          kernel_arg.block_2_ctile_map_);
+                }
+            }
+            if(!valid)
+                return false;
         }
 
         // all gemm problems have to simultaneously meet has_main_k_block_loop or
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 2c5d1dd134..7a1944cc68 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -39,46 +39,49 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             const BElementwiseOperation b_element_op,
                             const CDEElementwiseOperation c_element_op)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t block_id = get_block_1d_id();
-
-    const auto gemm_desc_ptr =
-        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
-
-    index_t left     = 0;
-    index_t right    = group_count;
-    index_t group_id = index_t((left + right) / 2);
-    while((!(block_id >= gemm_desc_ptr[group_id].BlockStart_ &&
-             block_id < gemm_desc_ptr[group_id].BlockEnd_)) &&
-          left <= right)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
     {
-        if(block_id < gemm_desc_ptr[group_id].BlockStart_)
-        {
-            right = group_id;
-        }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
-    }
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
-        gemm_desc_ptr[group_id].a_ptr_,
-        gemm_desc_ptr[group_id].b_ptr_,
-        gemm_desc_ptr[group_id].ds_ptr_,
-        gemm_desc_ptr[group_id].e_ptr_,
-        p_shared,
-        a_element_op,
-        b_element_op,
-        c_element_op,
-        gemm_desc_ptr[group_id].a_grid_desc_ak0_m_ak1_,
-        gemm_desc_ptr[group_id].b_grid_desc_bk0_n_bk1_,
-        gemm_desc_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-        gemm_desc_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
-        gemm_desc_ptr[group_id].block_2_etile_map_);
+        const index_t block_id = get_block_1d_id();
+
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        index_t left     = 0;
+        index_t right    = group_count;
+        index_t group_id = index_t((left + right) / 2);
+        while((!(block_id >= gemm_desc_ptr[group_id].BlockStart_ &&
+                 block_id < gemm_desc_ptr[group_id].BlockEnd_)) &&
+              left <= right)
+        {
+            if(block_id < gemm_desc_ptr[group_id].BlockStart_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
+        }
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
+            gemm_desc_ptr[group_id].a_ptr_,
+            gemm_desc_ptr[group_id].b_ptr_,
+            gemm_desc_ptr[group_id].ds_ptr_,
+            gemm_desc_ptr[group_id].e_ptr_,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            gemm_desc_ptr[group_id].a_grid_desc_ak0_m_ak1_,
+            gemm_desc_ptr[group_id].b_grid_desc_bk0_n_bk1_,
+            gemm_desc_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+            gemm_desc_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_,
+            gemm_desc_ptr[group_id].block_2_etile_map_);
+    }
 #else
     ignore = gemm_descs_const;
     ignore = group_count;
@@ -145,7 +148,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                                                         CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedGemm_Xdl;
-
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -231,7 +236,8 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     using ComputeDataType = ADataType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
         ComputeDataType,
@@ -252,7 +258,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -274,34 +280,36 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     using AGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(
             AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(
             BGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemm64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm64::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             EGridDesc_M_N{}))>;
 
     struct GroupedGemmBlock2ETileMap
     {
         using Block2ETileMap =
-            remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+            remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
         GroupedGemmBlock2ETileMap()
         {
-            block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{});
+            block_2_etile_map_ = GridwiseGemm64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{});
             BlockStart_        = -1;
         }
 
         GroupedGemmBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n, ck::index_t BlockStart)
         {
-            block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
+            block_2_etile_map_ = GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
             BlockStart_        = BlockStart;
         }
 
@@ -334,7 +342,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
         // pointers
         const ADataType* a_ptr_;
         const BDataType* b_ptr_;
-        typename GridwiseGemm::DsGridPointer ds_ptr_;
+        typename GridwiseGemm64::DsGridPointer ds_ptr_;
         EDataType* e_ptr_;
 
         // tensor descriptors for problem definiton
@@ -358,6 +366,64 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm, typename DsPointer, typename Block2ETileMap>
+        void init_gridwise_gemm_desc(const ADataType* a_ptr,
+                                     const BDataType* b_ptr,
+                                     DsPointer ds_ptr,
+                                     EDataType* e_ptr,
+                                     const AGridDesc_M_K& a_grid_desc_m_k,
+                                     const BGridDesc_N_K& b_grid_desc_n_k,
+                                     const DsGridDesc_M_N& ds_grid_desc_m_n,
+                                     const EGridDesc_M_N& e_grid_desc_m_n,
+                                     const Block2ETileMap& block_2_etile_map,
+                                     index_t BlockStart,
+                                     index_t BlockEnd)
+        {
+            // tensor descriptors for block/thread-wise copy
+            const auto a_grid_desc_ak0_m_ak1 =
+                GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k);
+
+            const auto b_grid_desc_bk0_n_bk1 =
+                GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k);
+
+            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
+                                           b_grid_desc_n_k,
+                                           ds_grid_desc_m_n,
+                                           e_grid_desc_m_n,
+                                           block_2_etile_map))
+            {
+                // tensor descriptors for block/thread-wise copy
+                DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock;
+
+                static_for<0, NumDTensor, 1>{}([&](auto j) {
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock(j) =
+                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            ds_grid_desc_m_n[j]);
+                });
+
+                const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n);
+
+                gemm_desc_kernel_arg_.push_back(
+                    GemmBiasTransKernelArg{a_ptr,
+                                           b_ptr,
+                                           ds_ptr,
+                                           e_ptr,
+                                           a_grid_desc_m_k,
+                                           b_grid_desc_n_k,
+                                           ds_grid_desc_m_n,
+                                           e_grid_desc_m_n,
+                                           a_grid_desc_ak0_m_ak1,
+                                           b_grid_desc_bk0_n_bk1,
+                                           ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                           block_2_etile_map,
+                                           BlockStart,
+                                           BlockEnd});
+            }
+        };
         Argument(std::vector<const void*>& p_As,
                  std::vector<const void*>& p_Bs,
                  std::vector<std::array<const void*, NumDTensor>>& p_Ds,
@@ -403,7 +469,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 const index_t StrideC = gemm_descs[i].stride_C_;
 
                 // pointer
-                typename GridwiseGemm::DsGridPointer p_ds_grid{};
+                typename GridwiseGemm64::DsGridPointer p_ds_grid{};
 
                 static_for<0, NumDTensor, 1>{}([&](auto j) {
                     using DDataType = remove_cvref_t<tuple_element_t<j.value, DsDataType>>;
@@ -427,13 +493,6 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 const auto e_grid_desc_m_n =
                     DeviceOp::MakeEGridDescriptor_M_N<ELayout>(M, N, StrideC);
 
-                // tensor descriptors for block/thread-wise copy
-                const auto a_grid_desc_ak0_m_ak1 =
-                    GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k);
-
-                const auto b_grid_desc_bk0_n_bk1 =
-                    GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k);
-
                 const index_t grid_size_grp =
                     GroupedGemmBlock2ETileMap(e_grid_desc_m_n, 0)
                         .block_2_etile_map_.CalculateGridSize(e_grid_desc_m_n);
@@ -447,42 +506,41 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 const auto block_2_etile_map =
                     GroupedGemmBlock2ETileMap(e_grid_desc_m_n, BlockStart);
 
-                if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
-                                               b_grid_desc_n_k,
-                                               ds_grid_desc_m_n,
-                                               e_grid_desc_m_n,
-                                               block_2_etile_map))
+                if(get_warp_size() == 64)
                 {
-                    // tensor descriptors for block/thread-wise copy
-                    DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                        ds_grid_desc_mblock_mperblock_nblock_nperblock;
-
-                    static_for<0, NumDTensor, 1>{}([&](auto j) {
-                        ds_grid_desc_mblock_mperblock_nblock_nperblock(j) =
-                            GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                                ds_grid_desc_m_n[j]);
-                    });
-
-                    const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            e_grid_desc_m_n);
-
-                    gemm_desc_kernel_arg_.push_back(
-                        GemmBiasTransKernelArg{static_cast<const ADataType*>(p_As[i]),
-                                               static_cast<const BDataType*>(p_Bs[i]),
-                                               p_ds_grid,
-                                               static_cast<EDataType*>(p_Es[i]),
-                                               a_grid_desc_m_k,
-                                               b_grid_desc_n_k,
-                                               ds_grid_desc_m_n,
-                                               e_grid_desc_m_n,
-                                               a_grid_desc_ak0_m_ak1,
-                                               b_grid_desc_bk0_n_bk1,
-                                               ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                               e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                               block_2_etile_map,
-                                               BlockStart,
-                                               BlockEnd});
+                    if constexpr(NXdlPerWave64 > 0)
+                    {
+                        init_gridwise_gemm_desc<GridwiseGemm64>(
+                            static_cast<const ADataType*>(p_As[i]),
+                            static_cast<const BDataType*>(p_Bs[i]),
+                            p_ds_grid,
+                            static_cast<EDataType*>(p_Es[i]),
+                            a_grid_desc_m_k,
+                            b_grid_desc_n_k,
+                            ds_grid_desc_m_n,
+                            e_grid_desc_m_n,
+                            block_2_etile_map,
+                            BlockStart,
+                            BlockEnd);
+                    }
+                }
+                else
+                {
+                    if constexpr(NXdlPerWave32 > 0)
+                    {
+                        init_gridwise_gemm_desc<GridwiseGemm32>(
+                            static_cast<const ADataType*>(p_As[i]),
+                            static_cast<const BDataType*>(p_Bs[i]),
+                            p_ds_grid,
+                            static_cast<EDataType*>(p_Es[i]),
+                            a_grid_desc_m_k,
+                            b_grid_desc_n_k,
+                            ds_grid_desc_m_n,
+                            e_grid_desc_m_n,
+                            block_2_etile_map,
+                            BlockStart,
+                            BlockEnd);
+                    }
                 }
             }
         }
@@ -508,10 +566,11 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg,
-                  const StreamConfig& stream_config = StreamConfig{},
-                  hipStream_t cpy_stream            = nullptr,
-                  hipEvent_t cpy_event              = nullptr)
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{},
+                     hipStream_t cpy_stream            = nullptr,
+                     hipEvent_t cpy_event              = nullptr)
         {
             bool has_main_k_block_loop = true;
 
@@ -626,6 +685,28 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
             return ave_time;
         }
 
+        float Run(const Argument& arg,
+                  const StreamConfig& stream_config = StreamConfig{},
+                  hipStream_t cpy_stream            = nullptr,
+                  hipEvent_t cpy_event              = nullptr)
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config, cpy_stream, cpy_event);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config, cpy_stream, cpy_event);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -636,11 +717,10 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if((ck::type_convert<ck::index_t>(arg.gemm_desc_kernel_arg_.size()) +
             arg.skipped_group_count_) != arg.group_count_)
         {
@@ -649,12 +729,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
 
         bool supported = true;
 
-        // If we use padding we do not support vector loads for dimensions not divisible by vector
-        // load size.
+        // If we use padding we do not support vector loads for dimensions not divisible by
+        // vector load size.
         if constexpr(GemmSpec != GemmSpecialization::Default)
         {
-            // [A|B]BlockTransferSrcVectorDim value define dimension in the block {K0,M,K1} layout,
-            // thus we have to adapt it to the {M,K} or {N,K} layout.
+            // [A|B]BlockTransferSrcVectorDim value define dimension in the block {K0,M,K1}
+            // layout, thus we have to adapt it to the {M,K} or {N,K} layout.
             const auto a_raw_vector_dim = ABlockTransferSrcVectorDim != 1 ? 1 : 0;
             const auto b_raw_vector_dim = BBlockTransferSrcVectorDim != 1 ? 1 : 0;
 
@@ -767,7 +847,8 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     size_t GetHostKernelArgSize(const BaseArgument* p_arg) const { return GetWorkSpaceSize(p_arg); }
 
     //----------------------------------------------------------------------------------------------
-    /// @brief      Sets the host kernel arguments pointer and copies that data on the host side.
+    /// @brief      Sets the host kernel arguments pointer and copies that data on the host
+    /// side.
     ///             This function can be utilised to use pinned memory for the host args and
     ///             achieve fully async data copy.
     ///
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
index 91c691b6a2..f2bc00cca5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -50,122 +50,125 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                      const BElementwiseOperation b_element_op,
                                      const CDEElementwiseOperation c_element_op)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    const index_t block_id = get_block_1d_id();
-
-    const auto gemm_desc_ptr =
-        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
-
-    const index_t group_id = block_id / grid_size_grp;
-
-    if(group_id >= group_count)
-        return;
-
-    const index_t M = gemm_desc_ptr[group_id].M;
-    const index_t N = gemm_desc_ptr[group_id].N;
-    const index_t K = gemm_desc_ptr[group_id].K;
-
-    if(M == 0 || N == 0 || K == 0)
-        return;
-
-    const auto StrideA  = gemm_desc_ptr[group_id].StrideA;
-    const auto StrideB  = gemm_desc_ptr[group_id].StrideB;
-    const auto StrideDs = gemm_desc_ptr[group_id].StrideDs;
-    const auto StrideE  = gemm_desc_ptr[group_id].StrideE;
-
-    const auto e_grid_desc_m_n =
-        GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(M, N, StrideE);
-
-    const index_t BlockStart = group_id * grid_size_grp;
-
-    const auto local_b2e_tile_map = Block2ETileMap{e_grid_desc_m_n, KBatch};
-
-    const auto local_grid_size = local_b2e_tile_map.CalculateGridSize(e_grid_desc_m_n);
-
-    constexpr auto NumDTensor = DsDataType::Size();
-
-    using DsGridPointer = decltype(GridwiseGemm::MakeDsGridPointer());
-
-    DsGridPointer p_ds_grid_;
-
-    static_for<0, NumDTensor, 1>{}([&](auto i) {
-        using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
-        // D pointer
-        p_ds_grid_(i) = static_cast<const DDataType*>(gemm_desc_ptr[group_id].p_ds_grid[i]);
-    });
-
-    index_t id_off   = 0;
-    index_t id_local = get_block_1d_id() - BlockStart;
-
-    const index_t mn_blocks = local_grid_size / KBatch;
-
-    while(id_local < local_grid_size)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<EGlobalMemoryDataOperation>())
     {
-        const auto block_2_etile_map =
-            GroupedGemmBlock2ETileMap(local_b2e_tile_map, BlockStart, id_off);
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        if constexpr(Zeroing)
-        {
-            auto barrier_count_finished =
-                barrier_count + group_id * barrier_size_grp + id_local % mn_blocks;
-            GridwiseGemm::template RunWithZeroing<HasMainKBlockLoop,
-                                                  EGlobalMemoryDataOperation,
-                                                  GemmSpec,
-                                                  ALayout,
-                                                  BLayout,
-                                                  DsLayout,
-                                                  ELayout>(gemm_desc_ptr[group_id].p_a_grid,
-                                                           gemm_desc_ptr[group_id].p_b_grid,
-                                                           p_ds_grid_,
-                                                           gemm_desc_ptr[group_id].p_e_grid,
-                                                           p_shared,
-                                                           barrier_count_finished,
-                                                           a_element_op,
-                                                           b_element_op,
-                                                           c_element_op,
-                                                           M,
-                                                           N,
-                                                           K,
-                                                           StrideA,
-                                                           StrideB,
-                                                           StrideDs,
-                                                           StrideE,
-                                                           KBatch,
-                                                           block_2_etile_map);
-        }
-        else
+        const index_t block_id = get_block_1d_id();
+
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        const index_t group_id = block_id / grid_size_grp;
+
+        if(group_id >= group_count)
+            return;
+
+        const index_t M = gemm_desc_ptr[group_id].M;
+        const index_t N = gemm_desc_ptr[group_id].N;
+        const index_t K = gemm_desc_ptr[group_id].K;
+
+        if(M == 0 || N == 0 || K == 0)
+            return;
+
+        const auto StrideA  = gemm_desc_ptr[group_id].StrideA;
+        const auto StrideB  = gemm_desc_ptr[group_id].StrideB;
+        const auto StrideDs = gemm_desc_ptr[group_id].StrideDs;
+        const auto StrideE  = gemm_desc_ptr[group_id].StrideE;
+
+        const auto e_grid_desc_m_n =
+            GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(M, N, StrideE);
+
+        const index_t BlockStart = group_id * grid_size_grp;
+
+        const auto local_b2e_tile_map = Block2ETileMap{e_grid_desc_m_n, KBatch};
+
+        const auto local_grid_size = local_b2e_tile_map.CalculateGridSize(e_grid_desc_m_n);
+
+        constexpr auto NumDTensor = DsDataType::Size();
+
+        using DsGridPointer = decltype(GridwiseGemm::MakeDsGridPointer());
+
+        DsGridPointer p_ds_grid_;
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+            // D pointer
+            p_ds_grid_(i) = static_cast<const DDataType*>(gemm_desc_ptr[group_id].p_ds_grid[i]);
+        });
+
+        index_t id_off   = 0;
+        index_t id_local = get_block_1d_id() - BlockStart;
+
+        const index_t mn_blocks = local_grid_size / KBatch;
+
+        while(id_local < local_grid_size)
         {
+            const auto block_2_etile_map =
+                GroupedGemmBlock2ETileMap(local_b2e_tile_map, BlockStart, id_off);
 
-            GridwiseGemm::template Run<HasMainKBlockLoop,
-                                       EGlobalMemoryDataOperation,
-                                       GemmSpec,
-                                       ALayout,
-                                       BLayout,
-                                       DsLayout,
-                                       ELayout>(gemm_desc_ptr[group_id].p_a_grid,
-                                                gemm_desc_ptr[group_id].p_b_grid,
-                                                p_ds_grid_,
-                                                gemm_desc_ptr[group_id].p_e_grid,
-                                                p_shared,
-                                                nullptr,
-                                                a_element_op,
-                                                b_element_op,
-                                                c_element_op,
-                                                M,
-                                                N,
-                                                K,
-                                                StrideA,
-                                                StrideB,
-                                                StrideDs,
-                                                StrideE,
-                                                KBatch,
-                                                block_2_etile_map);
-        }
+            if constexpr(Zeroing)
+            {
+                auto barrier_count_finished =
+                    barrier_count + group_id * barrier_size_grp + id_local % mn_blocks;
+                GridwiseGemm::template RunWithZeroing<HasMainKBlockLoop,
+                                                      EGlobalMemoryDataOperation,
+                                                      GemmSpec,
+                                                      ALayout,
+                                                      BLayout,
+                                                      DsLayout,
+                                                      ELayout>(gemm_desc_ptr[group_id].p_a_grid,
+                                                               gemm_desc_ptr[group_id].p_b_grid,
+                                                               p_ds_grid_,
+                                                               gemm_desc_ptr[group_id].p_e_grid,
+                                                               p_shared,
+                                                               barrier_count_finished,
+                                                               a_element_op,
+                                                               b_element_op,
+                                                               c_element_op,
+                                                               M,
+                                                               N,
+                                                               K,
+                                                               StrideA,
+                                                               StrideB,
+                                                               StrideDs,
+                                                               StrideE,
+                                                               KBatch,
+                                                               block_2_etile_map);
+            }
+            else
+            {
 
-        id_off += grid_size_grp;
-        id_local += grid_size_grp;
+                GridwiseGemm::template Run<HasMainKBlockLoop,
+                                           EGlobalMemoryDataOperation,
+                                           GemmSpec,
+                                           ALayout,
+                                           BLayout,
+                                           DsLayout,
+                                           ELayout>(gemm_desc_ptr[group_id].p_a_grid,
+                                                    gemm_desc_ptr[group_id].p_b_grid,
+                                                    p_ds_grid_,
+                                                    gemm_desc_ptr[group_id].p_e_grid,
+                                                    p_shared,
+                                                    nullptr,
+                                                    a_element_op,
+                                                    b_element_op,
+                                                    c_element_op,
+                                                    M,
+                                                    N,
+                                                    K,
+                                                    StrideA,
+                                                    StrideB,
+                                                    StrideDs,
+                                                    StrideE,
+                                                    KBatch,
+                                                    block_2_etile_map);
+            }
+
+            id_off += grid_size_grp;
+            id_local += grid_size_grp;
+        }
     }
 #else
     ignore = gemm_descs_const;
@@ -241,6 +244,9 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
                                                                         CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedGemm_Xdl_Fixed_NK;
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
 
     static constexpr index_t NumDTensor = DsDataType::Size();
 
@@ -252,7 +258,8 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
     using BComputeType = ComputeType;
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_splitk_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_splitk_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
         AComputeType,
@@ -274,7 +281,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -299,6 +306,8 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
         PipelineVer,
         ALDSType,
         BLDSType>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     template <typename UnderlyingBlockToCTileMap>
     struct OffsettedBlockToCTileMapMLoops
@@ -479,7 +488,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
             const index_t N       = gemm_desc_kernel_arg_[0].N_;
 
             const auto e_grid_desc_m_n =
-                GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                GridwiseGemm64::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
                     AverM, N, StrideE);
 
             const auto local_b2c_tile_map = Block2ETileMap{e_grid_desc_m_n, k_batch_};
@@ -550,7 +559,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
                 });
 
                 const auto e_grid_desc_m_n =
-                    GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                    GridwiseGemm64::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
                         AverM, N, StrideE);
 
                 // block-to-e-tile map
@@ -566,17 +575,55 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
                 grid_size_ += grid_size_grp_;
 
                 // check block-to-E-tile
+
                 if(!local_b2c_tile_map.CheckValidity(e_grid_desc_m_n))
                 {
                     throw std::runtime_error("wrong! block_2_etile_map validation failed");
                 }
 
-                if(!GridwiseGemm::
-                       template CheckValidity<ALayout, BLayout, DsLayout, ELayout, GemmSpec>(
-                           AverM, N, K, StrideA, StrideB, StrideDs, StrideE, 1))
+                if(get_warp_size() == 64)
                 {
-                    throw std::runtime_error(
-                        "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                    if constexpr(NXdlPerWave64 > 0)
+                    {
+                        if(!GridwiseGemm64::template CheckValidity<ALayout,
+                                                                   BLayout,
+                                                                   DsLayout,
+                                                                   ELayout,
+                                                                   GemmSpec>(
+                               AverM, N, K, StrideA, StrideB, StrideDs, StrideE, 1))
+                        {
+                            throw std::runtime_error(
+                                "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid "
+                                "setting");
+                        }
+                    }
+                    else
+                    {
+                        throw std::runtime_error(
+                            "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                    }
+                }
+                else
+                {
+                    if constexpr(NXdlPerWave32 > 0)
+                    {
+                        if(!GridwiseGemm32::template CheckValidity<ALayout,
+                                                                   BLayout,
+                                                                   DsLayout,
+                                                                   ELayout,
+                                                                   GemmSpec>(
+                               AverM, N, K, StrideA, StrideB, StrideDs, StrideE, 1))
+                        {
+                            throw std::runtime_error(
+                                "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid "
+                                "setting");
+                        }
+                    }
+                    else
+                    {
+                        throw std::runtime_error(
+                            "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                    }
                 }
 
                 gemm_desc_kernel_arg_.push_back(GemmBiasTransKernelArg{
@@ -597,7 +644,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
             }
 
             const auto e_grid_desc_sum_m_n =
-                GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                GridwiseGemm64::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
                     sum_of_m, gemm_desc_kernel_arg_[0].N_, gemm_desc_kernel_arg_[0].StrideE_);
 
             const auto local_b2c_tile_map = Block2ETileMap{e_grid_desc_sum_m_n, 1};
@@ -631,7 +678,8 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             bool has_main_k_block_loop = true;
 
@@ -783,6 +831,8 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 45d46de74b..38ce211eda 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -41,39 +41,42 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                    const BElementwiseOperation b_element_op,
                                    const CElementwiseOperation c_element_op)
 {
-#if defined(__gfx9__)
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
-    __shared__ uint8_t p_shared[shared_size];
-
-    const index_t block_id = get_block_1d_id();
-    const auto gemm_desc_ptr =
-        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
-
-    index_t left     = 0;
-    index_t right    = group_count;
-    index_t group_id = index_t((left + right) / 2);
-    while((!(block_id >= gemm_desc_ptr[group_id].block_start_ &&
-             block_id < gemm_desc_ptr[group_id].block_end_)) &&
-          left <= right)
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
     {
-        if(block_id < gemm_desc_ptr[group_id].block_start_)
-        {
-            right = group_id;
-        }
-        else
-        {
-            left = group_id;
-        }
-        group_id = index_t((left + right) / 2);
-    }
+        constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+        __shared__ uint8_t p_shared[shared_size];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        gemm_desc_ptr[group_id].karg_,
-        static_cast<void*>(p_shared),
-        gemm_desc_ptr[group_id].block_2_ctile_map_,
-        a_element_op,
-        b_element_op,
-        c_element_op);
+        const index_t block_id   = get_block_1d_id();
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        index_t left     = 0;
+        index_t right    = group_count;
+        index_t group_id = index_t((left + right) / 2);
+        while((!(block_id >= gemm_desc_ptr[group_id].block_start_ &&
+                 block_id < gemm_desc_ptr[group_id].block_end_)) &&
+              left <= right)
+        {
+            if(block_id < gemm_desc_ptr[group_id].block_start_)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
+        }
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+            gemm_desc_ptr[group_id].karg_,
+            static_cast<void*>(p_shared),
+            gemm_desc_ptr[group_id].block_2_ctile_map_,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+    }
 #else
     ignore = gemm_descs_const;
     ignore = group_count;
@@ -144,6 +147,9 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                                                                            BElementwiseOperation,
                                                                            CDEElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -153,7 +159,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
     static_assert(KPerBlock % AK1 == 0);
     static constexpr index_t K0PerBlock = KPerBlock / AK1;
 
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType,
         BDataType,
@@ -174,7 +181,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         NPerXDL,
         AK1,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -197,26 +204,29 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         LoopSched,
         PipelineVer>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
+    using CGridDesc_M_N = typename GridwiseGemm64::CGridDesc_M_N;
     using Block2ETileMapKSplit =
         BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>;
     // Block2CTileMap configuration parameter.
     static constexpr index_t B2E_M01 = 8;
     using GroupedGemmBlock2ETileMap  = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
-    using KernelArgument             = typename GridwiseGemm::Argument;
+    using KernelArgument             = typename GridwiseGemm64::Argument;
     using PassThrough                = ck::tensor_operation::element_wise::PassThrough;
-    struct GemmTransKernelArg
+    template <typename KernelArgument_>
+    struct GemmTransKernelArgBase
     {
-        KernelArgument karg_;
+        KernelArgument_ karg_;
         GroupedGemmBlock2ETileMap block_2_ctile_map_;
         index_t block_start_, block_end_;
 
-        GemmTransKernelArg() = default;
-        GemmTransKernelArg(KernelArgument&& karg,
-                           GroupedGemmBlock2ETileMap&& b2c_map,
-                           index_t block_start,
-                           index_t block_end)
+        GemmTransKernelArgBase() = default;
+        GemmTransKernelArgBase(KernelArgument_&& karg,
+                               GroupedGemmBlock2ETileMap&& b2c_map,
+                               index_t block_start,
+                               index_t block_end)
             : karg_{karg},
               block_2_ctile_map_{b2c_map},
               block_start_{block_start},
@@ -224,6 +234,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         {
         }
     };
+    using GemmTransKernelArg = GemmTransKernelArgBase<KernelArgument>;
 
     static constexpr index_t DefaultKBatch = 1;
 
@@ -277,12 +288,13 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 const index_t stride_b = gemm_descs[i].stride_B_;
                 const index_t stride_c = gemm_descs[i].stride_C_;
 
-                const index_t m_padded  = GridwiseGemm::CalculateMPadded(M);
-                const index_t n_padded  = GridwiseGemm::CalculateNPadded(N);
-                const index_t k_padded  = GridwiseGemm::CalculateKPadded(K, K_BATCH);
-                const index_t k0_padded = GridwiseGemm::CalculateK0Padded(K, K_BATCH);
+                const index_t m_padded  = GridwiseGemm64::CalculateMPadded(M);
+                const index_t n_padded  = GridwiseGemm64::CalculateNPadded(N);
+                const index_t k_padded  = GridwiseGemm64::CalculateKPadded(K, K_BATCH);
+                const index_t k0_padded = GridwiseGemm64::CalculateK0Padded(K, K_BATCH);
 
-                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(M, N, stride_c);
+                const auto c_grid_desc_m_n =
+                    GridwiseGemm64::MakeCGridDescriptor_M_N(M, N, stride_c);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -332,11 +344,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
 
                 auto& karg = gemm_kernel_args_[i].karg_;
 
-                const index_t k_padded  = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
-                const index_t k0_padded = GridwiseGemm::CalculateK0Padded(karg.K, K_BATCH);
+                const index_t k_padded  = GridwiseGemm64::CalculateKPadded(karg.K, K_BATCH);
+                const index_t k0_padded = GridwiseGemm64::CalculateK0Padded(karg.K, K_BATCH);
 
                 const auto c_grid_desc_m_n =
-                    GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
+                    GridwiseGemm64::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -373,18 +385,25 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg,
-                  const StreamConfig& stream_config = StreamConfig{},
-                  hipStream_t cpy_stream            = nullptr,
-                  hipEvent_t cpy_event              = nullptr)
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{},
+                     hipStream_t cpy_stream            = nullptr,
+                     hipEvent_t cpy_event              = nullptr)
         {
+            using GemmTransKernelArg_ = GemmTransKernelArgBase<typename GridwiseGemm::Argument>;
+            static_assert(sizeof(GemmTransKernelArg_) == sizeof(GemmTransKernelArg));
+            static_assert(sizeof(typename GridwiseGemm::Argument) ==
+                          sizeof(typename GridwiseGemm64::Argument));
+
             index_t K0                       = arg.gemm_kernel_args_[0].karg_.K0Padded;
             bool all_have_kbatch_gt_one      = arg.gemm_kernel_args_[0].karg_.k_batch > 1;
             bool all_have_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
 
             for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
             {
-                const auto& karg = arg.gemm_kernel_args_[i].karg_;
+                const auto& karg = reinterpret_cast<const typename GridwiseGemm::Argument&>(
+                    arg.gemm_kernel_args_[i].karg_);
                 if(stream_config.log_level_ > 0)
                 {
                     karg.Print();
@@ -439,7 +458,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 }
                 hip_check_error(hipMemcpyAsync(arg.p_workspace_,
                                                arg.gemm_kernel_host_args_,
-                                               arg.group_count_ * sizeof(GemmTransKernelArg),
+                                               arg.group_count_ * sizeof(GemmTransKernelArg_),
                                                hipMemcpyHostToDevice,
                                                cpy_stream));
                 hip_check_error(hipEventRecord(cpy_event, cpy_stream));
@@ -451,7 +470,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 hip_check_error(
                     hipMemcpyAsync(arg.p_workspace_,
                                    arg.gemm_kernel_args_.data(),
-                                   arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                   arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg_),
                                    hipMemcpyHostToDevice,
                                    stream_config.stream_id_));
             }
@@ -490,7 +509,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 {
                     const auto kernel =
                         kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
-                                                       GemmTransKernelArg,
+                                                       GemmTransKernelArg_,
                                                        true,
                                                        InMemoryDataOperationEnum::AtomicAdd>;
 
@@ -500,7 +519,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 {
                     const auto kernel =
                         kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
-                                                       GemmTransKernelArg,
+                                                       GemmTransKernelArg_,
                                                        true,
                                                        InMemoryDataOperationEnum::Set>;
 
@@ -513,7 +532,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 {
                     const auto kernel =
                         kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
-                                                       GemmTransKernelArg,
+                                                       GemmTransKernelArg_,
                                                        false,
                                                        InMemoryDataOperationEnum::AtomicAdd>;
 
@@ -523,7 +542,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 {
                     const auto kernel =
                         kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
-                                                       GemmTransKernelArg,
+                                                       GemmTransKernelArg_,
                                                        false,
                                                        InMemoryDataOperationEnum::Set>;
 
@@ -534,6 +553,28 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
             return ave_time;
         }
 
+        float Run(const Argument& arg,
+                  const StreamConfig& stream_config = StreamConfig{},
+                  hipStream_t cpy_stream            = nullptr,
+                  hipEvent_t cpy_event              = nullptr)
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config, cpy_stream, cpy_event);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(arg, stream_config, cpy_stream, cpy_event);
+                }
+            }
+            return 0;
+        }
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -550,11 +591,14 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
+        {
+            return false;
+        }
+        if(is_gfx11_supported() && arg.K_BATCH > 1)
         {
             return false;
         }
-
         if((ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size()) +
             arg.skipped_group_count_) != arg.group_count_)
         {
@@ -573,11 +617,27 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         }
 
         bool supported = true;
+        bool isWave64  = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
         {
-            const auto& a = arg.gemm_kernel_args_[i].karg_;
+            const auto& a        = arg.gemm_kernel_args_[i].karg_;
+            bool group_arg_valid = false;
+            if(isWave64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    group_arg_valid = GridwiseGemm64::CheckValidity(a);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    group_arg_valid = GridwiseGemm32::CheckValidity(
+                        reinterpret_cast<const typename GridwiseGemm32::Argument&>(a));
+                }
+            }
 
-            bool group_arg_valid = GridwiseGemm::CheckValidity(a);
             if(not group_arg_valid)
             {
                 if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
index 27d3c378ac..748ae28a50 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
@@ -87,8 +87,12 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
                                                                    BElementwiseOperation,
                                                                    CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
-    using GridwiseGemm =
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase =
         GridwiseMoeGemm<ALayout,
                         BLayout,
                         DsLayout,
@@ -112,7 +116,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
                         MPerXDL,
                         NPerXDL,
                         MXdlPerWave,
-                        NXdlPerWave,
+                        NXdlPerWave_,
                         ABlockTransferThreadClusterLengths_AK0_M_AK1,
                         ABlockTransferThreadClusterArrangeOrder,
                         ABlockTransferSrcAccessOrder,
@@ -130,7 +134,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
                         false,
                         BBlockLdsExtraN,
                         CShuffleMXdlPerWavePerShuffle,
-                        CShuffleNXdlPerWavePerShuffle,
+                        math::min(CShuffleNXdlPerWavePerShuffle, NXdlPerWave_),
                         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                         CDEShuffleBlockTransferScalarPerVectors,
                         BlkGemmPipeSched,
@@ -145,8 +149,10 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
                         ComputeTypeB,
                         LDSTypeA,
                         LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -167,7 +173,9 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -195,7 +203,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -214,8 +222,13 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -376,6 +389,8 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -397,11 +412,10 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
         {
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -418,8 +432,22 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
         {
             return false;
         }
-
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -553,7 +581,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
index efa85a357c..2475ae2dc5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
@@ -98,68 +98,74 @@ struct DeviceMoeGemmBlockScale
                                                         BElementwiseOperation,
                                                         CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
-    using GridwiseGemm                  = GridwiseMoeGemmBlockScale<
-                         ALayout,
-                         BLayout,
-                         DsLayout,
-                         CLayout,
-                         ADataType,
-                         BDataType,
-                         GemmAccDataType,
-                         CShuffleDataType,
-                         DsDataType,
-                         CDataType,
-                         AElementwiseOperation,
-                         BElementwiseOperation,
-                         CElementwiseOperation,
-                         GemmSpec,
-                         BlockSize,
-                         ScaleBlockM,
-                         ScaleBlockN,
-                         ScaleBlockK,
-                         MPerBlock,
-                         NPerBlock,
-                         KPerBlock,
-                         AK1,
-                         BK1,
-                         MPerXDL,
-                         NPerXDL,
-                         MXdlPerWave,
-                         NXdlPerWave,
-                         ABlockTransferThreadClusterLengths_AK0_M_AK1,
-                         ABlockTransferThreadClusterArrangeOrder,
-                         ABlockTransferSrcAccessOrder,
-                         ABlockTransferSrcVectorDim,
-                         ABlockTransferSrcScalarPerVector,
-                         ABlockTransferDstScalarPerVector_AK1,
-                         false,
-                         ABlockLdsExtraM,
-                         BBlockTransferThreadClusterLengths_BK0_N_BK1,
-                         BBlockTransferThreadClusterArrangeOrder,
-                         BBlockTransferSrcAccessOrder,
-                         BBlockTransferSrcVectorDim,
-                         BBlockTransferSrcScalarPerVector,
-                         BBlockTransferDstScalarPerVector_BK1,
-                         false,
-                         BBlockLdsExtraN,
-                         CShuffleMXdlPerWavePerShuffle,
-                         CShuffleNXdlPerWavePerShuffle,
-                         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-                         CDEShuffleBlockTransferScalarPerVectors,
-                         BlkGemmPipeSched,
-                         BlkGemmPipelineVer,
-                         ActivationOP,
-                         NSwizzle,
-                         IsInputGemm,
-                         MulRoutedWeight,
-                         IndexType,
-                         ComputeTypeA,
-                         ComputeTypeB,
-                         LDSTypeA,
-                         LDSTypeB>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseMoeGemmBlockScale<
+        ALayout,
+        BLayout,
+        DsLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        DsDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        ScaleBlockM,
+        ScaleBlockN,
+        ScaleBlockK,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave_,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        math::min(CShuffleNXdlPerWavePerShuffle, NXdlPerWave_),
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ActivationOP,
+        NSwizzle,
+        IsInputGemm,
+        MulRoutedWeight,
+        IndexType,
+        ComputeTypeA,
+        ComputeTypeB,
+        LDSTypeA,
+        LDSTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -180,7 +186,9 @@ struct DeviceMoeGemmBlockScale
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -207,7 +215,7 @@ struct DeviceMoeGemmBlockScale
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -226,8 +234,13 @@ struct DeviceMoeGemmBlockScale
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                                      DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -385,6 +398,8 @@ struct DeviceMoeGemmBlockScale
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -406,11 +421,10 @@ struct DeviceMoeGemmBlockScale
         {
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -428,7 +442,22 @@ struct DeviceMoeGemmBlockScale
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -572,7 +601,7 @@ struct DeviceMoeGemmBlockScale
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp
index e7be94242b..9c14106033 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp
@@ -90,8 +90,12 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
                                                            BElementwiseOperation,
                                                            CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
-    using GridwiseGemm =
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase =
         GridwiseMoeGemmMX<ALayout,
                           BLayout,
                           DsLayout,
@@ -118,7 +122,7 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
                           MPerXDL,
                           NPerXDL,
                           MXdlPerWave,
-                          NXdlPerWave,
+                          NXdlPerWave_,
                           ABlockTransferThreadClusterLengths_AK0_M_AK1,
                           ABlockTransferThreadClusterArrangeOrder,
                           ABlockTransferSrcAccessOrder,
@@ -148,9 +152,10 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
                           IndexType,
                           ComputeTypeA,
                           ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
-
+    using Argument                       = typename GridwiseGemm64::Argument;
     static constexpr index_t APackedSize = packed_size_v<ADataType>;
     static constexpr index_t BPackedSize = packed_size_v<BDataType>;
 
@@ -159,7 +164,9 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -187,7 +194,7 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -206,8 +213,13 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -333,6 +345,8 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -354,11 +368,10 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
         {
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -376,7 +389,22 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -530,7 +558,7 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp
index bb7dcae9de..55bd9e0116 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp
@@ -90,8 +90,12 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
                                                               BElementwiseOperation,
                                                               CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
-    using GridwiseGemm =
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase =
         GridwiseMoeGemmMXBNS<ALayout,
                              BLayout,
                              DsLayout,
@@ -118,7 +122,7 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
                              MPerXDL,
                              NPerXDL,
                              MXdlPerWave,
-                             NXdlPerWave,
+                             NXdlPerWave_,
                              ABlockTransferThreadClusterLengths_AK0_M_AK1,
                              ABlockTransferThreadClusterArrangeOrder,
                              ABlockTransferSrcAccessOrder,
@@ -148,8 +152,10 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
                              IndexType,
                              ComputeTypeA,
                              ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = packed_size_v<ADataType>;
     static constexpr index_t BPackedSize = packed_size_v<BDataType>;
@@ -159,7 +165,9 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -187,7 +195,7 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -206,8 +214,13 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -331,6 +344,8 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -352,11 +367,10 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
         {
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -374,7 +388,22 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -528,7 +557,7 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
index 4bf38d9d1f..214b78c38b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
@@ -90,66 +90,72 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
                                                                       BElementwiseOperation,
                                                                       CElementwiseOperation>
 {
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
-    using GridwiseGemm                  = GridwiseMoeGemmMX_BPreshuffle<
-                         ALayout,
-                         BLayout,
-                         DsLayout,
-                         CLayout,
-                         ADataType,
-                         AScaleDataType,
-                         BDataType,
-                         BScaleDataType,
-                         GemmAccDataType,
-                         CShuffleDataType,
-                         DsDataType,
-                         CDataType,
-                         AElementwiseOperation,
-                         BElementwiseOperation,
-                         CElementwiseOperation,
-                         GemmSpec,
-                         ScaleBlockSize,
-                         BlockSize,
-                         MPerBlock,
-                         NPerBlock,
-                         KPerBlock,
-                         AK1,
-                         BK1,
-                         MPerXDL,
-                         NPerXDL,
-                         MXdlPerWave,
-                         NXdlPerWave,
-                         ABlockTransferThreadClusterLengths_AK0_M_AK1,
-                         ABlockTransferThreadClusterArrangeOrder,
-                         ABlockTransferSrcAccessOrder,
-                         ABlockTransferSrcVectorDim,
-                         ABlockTransferSrcScalarPerVector,
-                         ABlockTransferDstScalarPerVector_AK1,
-                         false,
-                         ABlockLdsExtraM,
-                         BBlockTransferThreadClusterLengths_BK0_N_BK1,
-                         BBlockTransferThreadClusterArrangeOrder,
-                         BBlockTransferSrcAccessOrder,
-                         BBlockTransferSrcVectorDim,
-                         BBlockTransferSrcScalarPerVector,
-                         BBlockTransferDstScalarPerVector_BK1,
-                         false,
-                         BBlockLdsExtraN,
-                         CShuffleMXdlPerWavePerShuffle,
-                         CShuffleNXdlPerWavePerShuffle,
-                         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-                         CDEShuffleBlockTransferScalarPerVectors,
-                         BlkGemmPipeSched,
-                         BlkGemmPipelineVer,
-                         ActivationOP,
-                         NSwizzle,
-                         IsInputGemm,
-                         MulRoutedWeight,
-                         IndexType,
-                         ComputeTypeA,
-                         ComputeTypeB>;
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseMoeGemmMX_BPreshuffle<
+        ALayout,
+        BLayout,
+        DsLayout,
+        CLayout,
+        ADataType,
+        AScaleDataType,
+        BDataType,
+        BScaleDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        DsDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        ScaleBlockSize,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave_,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ActivationOP,
+        NSwizzle,
+        IsInputGemm,
+        MulRoutedWeight,
+        IndexType,
+        ComputeTypeA,
+        ComputeTypeB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = packed_size_v<ADataType>;
     static constexpr index_t BPackedSize = packed_size_v<BDataType>;
@@ -159,7 +165,9 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -187,7 +195,7 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
 
                     std::array<std::size_t, NumDTensor> DsSize;
 
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -206,8 +214,13 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
                         using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
-                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
+                                                          DsDataType>
+                        rotating_mem(arg_,
+                                     stream_config.rotating_count,
+                                     size_a_buffer,
+                                     size_b_buffer,
+                                     DsSize);
                     rotating_mem.Print();
 
                     auto run_flush_cache = [&]() {
@@ -358,6 +371,8 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return ave_time;
         }
 
+        INVOKER_RUN3_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -379,11 +394,10 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
         {
             return false;
         }
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ComputeTypeA, ComputeTypeB, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
         if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
         {
             return false;
@@ -401,7 +415,22 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+        }
+        return false;
     }
 
     // polymorphic
@@ -555,7 +584,7 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
index b60370fd8e..cff386c7a7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -56,43 +56,48 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         const Block2ETileMap block_2_etile_map)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+        const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
-    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+        const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
-    FloatDsPointer p_ds_grid_grp;
+        FloatDsPointer p_ds_grid_grp;
 
-    static constexpr index_t NumDTensor =
-        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
+        static constexpr index_t NumDTensor =
+            DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
 
-    static_for<0, NumDTensor, 1>{}(
-        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_ds_grid_grp,
-                                                  p_e_grid + e_batch_offset,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  a_grid_desc_akb_ak0_m_ak1,
-                                                  b_grid_desc_bkb_bk0_n_bk1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid + a_batch_offset,
+            p_b_grid + b_batch_offset,
+            p_ds_grid_grp,
+            p_e_grid + e_batch_offset,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_akb_ak0_m_ak1,
+            b_grid_desc_bkb_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -190,7 +195,9 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
                                               CDEElementwiseOperation>
 {
     using DeviceOp = DeviceSplitKContractionMultipleD_Xdl_CShuffle;
-
+    GET_NXDL_PER_WAVE_IMPL
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
     static constexpr auto I0 = Number<0>{};
@@ -521,7 +528,8 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
     };
 
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmSplitKMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemmSplitKMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CShuffleDataType,
@@ -545,7 +553,7 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -567,9 +575,12 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
     // GridwiseGemm
-    using GridwiseGemmAtomicAdd = GridwiseGemmSplitKMultipleD_xdl_cshuffle<
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmAtomicAddBase = GridwiseGemmSplitKMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CShuffleDataType,
@@ -593,7 +604,7 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -615,19 +626,39 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
+    using GridwiseGemmAtomicAdd64 = GridwiseGemmAtomicAddBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemmAtomicAdd32 = GridwiseGemmAtomicAddBase<NXdlPerWave32>;
 
     using AGridDesc_AKB_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AKB_AK0_M_AK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AKB_AK0_M_AK1(
             AGridDesc_M_K{}, 1))>;
     using BGridDesc_BKB_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BKB_BK0_N_BK1(
+        remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BKB_BK0_N_BK1(
             BGridDesc_N_K{}, 1))>;
 
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    using Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap;
 
     // Argument
     struct Argument : public BaseArgument
     {
+        template <typename GridwiseGemm>
+        void init_ds_e_grid_desc()
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_akb_ak0_m_ak1_,
+                                           b_grid_desc_bkb_bk0_n_bk1_,
+                                           ds_grid_desc_m_n_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+
+                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        ds_grid_desc_m_n_);
+            }
+        }
         Argument(const void* p_a_grid,
                  const void* p_b_grid,
                  std::array<const void*, NumDTensor> p_ds_grid,
@@ -659,14 +690,14 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
                   DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)},
               e_grid_desc_g_m_n_{
                   DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)},
-              a_grid_desc_akb_ak0_m_ak1_{GridwiseGemm::MakeDefaultAGridDescriptor_AKB_AK0_M_AK1(
+              a_grid_desc_akb_ak0_m_ak1_{GridwiseGemm64::MakeDefaultAGridDescriptor_AKB_AK0_M_AK1(
                   a_grid_desc_m_k_, split_k)},
-              b_grid_desc_bkb_bk0_n_bk1_{GridwiseGemm::MakeDefaultBGridDescriptor_BKB_BK0_N_BK1(
+              b_grid_desc_bkb_bk0_n_bk1_{GridwiseGemm64::MakeDefaultBGridDescriptor_BKB_BK0_N_BK1(
                   b_grid_desc_n_k_, split_k)},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
               block_2_etile_map_{
-                  GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_, split_k)},
+                  GridwiseGemm64::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_, split_k)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -697,19 +728,19 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
             });
 
             // populate desc for Ds/E
-            if(GridwiseGemm::CheckValidity(a_grid_desc_akb_ak0_m_ak1_,
-                                           b_grid_desc_bkb_bk0_n_bk1_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
+            if(get_warp_size() == 64)
             {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_);
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm64>();
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    init_ds_e_grid_desc<GridwiseGemm32>();
+                }
             }
 
             // for sanity check of vector memory access
@@ -755,7 +786,7 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemm64::DsGridPointer p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptors for problem definiton
@@ -770,9 +801,9 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1_;
         BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemm64::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        typename GridwiseGemm64::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;
 
         // block-to-e-tile map
@@ -806,7 +837,8 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_,
                                             arg.b_grid_desc_bkb_bk0_n_bk1_,
@@ -818,6 +850,11 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
                     "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
             }
 
+            using GridwiseGemmAtomicAdd =
+                std::conditional_t<std::is_same_v<GridwiseGemm, GridwiseGemm64>,
+                                   GridwiseGemmAtomicAdd64,
+                                   GridwiseGemmAtomicAdd32>;
+
             const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0);
 
             const index_t grid_size =
@@ -931,6 +968,8 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
             }
         }
 
+        INVOKER_RUN_IMPL
+
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -941,19 +980,35 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!ck::is_xdl_supported())
+        if(!ck::is_xdl_wmma_supported<ADataType, BDataType, MPerXDL, NPerXDL>())
         {
             return false;
         }
-
-        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_,
-                                        arg.b_grid_desc_bkb_bk0_n_bk1_,
-                                        arg.ds_grid_desc_m_n_,
-                                        arg.e_grid_desc_m_n_,
-                                        arg.block_2_etile_map_))
+        bool valid = false;
+        if(get_warp_size() == 64)
         {
-            return false;
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                valid = GridwiseGemm64::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bkb_bk0_n_bk1_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
         }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                valid = GridwiseGemm32::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_,
+                                                      arg.b_grid_desc_bkb_bk0_n_bk1_,
+                                                      arg.ds_grid_desc_m_n_,
+                                                      arg.e_grid_desc_m_n_,
+                                                      arg.block_2_etile_map_);
+            }
+        }
+        if(!valid)
+            return false;
 
         // check vector access
         static_assert((ABlockTransferSrcVectorDim == 2 || ABlockTransferSrcVectorDim == 3) &&
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index e836e73a1d..79deb81512 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -8,21 +8,31 @@ namespace tensor_layout {
 
 struct BaseTensorLayout
 {
+    static constexpr const char* name = "BaseTensorLayout";
+};
+
+struct BypassLayoutVerification : public BaseTensorLayout
+{
+    static constexpr const char* name = "BypassLayoutVerification";
 };
 
 namespace gemm {
 
-struct RowMajor : public BaseTensorLayout
+struct BaseGemmLayout : public BaseTensorLayout
+{
+    static constexpr const char* name = "BaseConvolutionLayout";
+};
+struct RowMajor : public BaseGemmLayout
 {
     static constexpr const char* name = "RowMajor";
 };
 
-struct ColumnMajor : public BaseTensorLayout
+struct ColumnMajor : public BaseGemmLayout
 {
     static constexpr const char* name = "ColumnMajor";
 };
 
-struct MFMA : public BaseTensorLayout
+struct MFMA : public BaseGemmLayout
 {
     static constexpr const char* name = "MFMA";
 };
@@ -31,405 +41,410 @@ struct MFMA : public BaseTensorLayout
 
 namespace convolution {
 
+struct BaseConvolutionLayout : public BaseTensorLayout
+{
+    static constexpr const char* name = "BaseConvolutionLayout";
+};
+
 // input tensor
 // packed NCW/NCHW/NCDHW
-struct NCW : public BaseTensorLayout
+struct NCW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NCW";
 };
 
-struct NCHW : public BaseTensorLayout
+struct NCHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NCHW";
 };
 
-struct NCDHW : public BaseTensorLayout
+struct NCDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NCDHW";
 };
 
 // packed GNCW/GNCHW/GNCDHW
-struct GNCW : public BaseTensorLayout
+struct GNCW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNCW";
 };
 
-struct GNCHW : public BaseTensorLayout
+struct GNCHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNCHW";
 };
 
-struct GNCDHW : public BaseTensorLayout
+struct GNCDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNCDHW";
 };
 
 // input tensor
 // packed NWC/NHWC/NDHWC
-struct NWC : public BaseTensorLayout
+struct NWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NWC";
 };
 
-struct NHWC : public BaseTensorLayout
+struct NHWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NHWC";
 };
 
-struct NDHWC : public BaseTensorLayout
+struct NDHWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NDHWC";
 };
 
 // input tensor
 // packed GNWC/GNHWC/GNDHWC
-struct GNWC : public BaseTensorLayout
+struct GNWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNWC";
 };
 
-struct GNHWC : public BaseTensorLayout
+struct GNHWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNHWC";
 };
 
-struct GNDHWC : public BaseTensorLayout
+struct GNDHWC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNDHWC";
 };
 
 // for input bias
-struct GC : public BaseTensorLayout
+struct GC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GC";
 };
 
 // input tensor
 // packed NWGC/NHWGC/NDHWGC
-struct NWGC : public BaseTensorLayout
+struct NWGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NWGC";
 };
 
-struct NHWGC : public BaseTensorLayout
+struct NHWGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NHWGC";
 };
 
-struct NDHWGC : public BaseTensorLayout
+struct NDHWGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NDHWGC";
 };
 
 // input tensor
 // packed NGCW/NGCHW/NGCDHW
-struct NGCW : public BaseTensorLayout
+struct NGCW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGCW";
 };
 
-struct NGCHW : public BaseTensorLayout
+struct NGCHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGCHW";
 };
 
-struct NGCDHW : public BaseTensorLayout
+struct NGCDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGCDHW";
 };
 
 // input tensor
 // strided layout
-struct G_NW_C : public BaseTensorLayout
+struct G_NW_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NW_C";
 };
 
-struct G_NHW_C : public BaseTensorLayout
+struct G_NHW_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NHW_C";
 };
 
-struct G_NDHW_C : public BaseTensorLayout
+struct G_NDHW_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NDHW_C";
 };
 
 // for input bias
-struct G_C : public BaseTensorLayout
+struct G_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_C";
 };
 
 // weight tensor
 // packed KCX/KCYX/KCZYX
-struct KCX : public BaseTensorLayout
+struct KCX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KCX";
 };
 
-struct KCYX : public BaseTensorLayout
+struct KCYX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KCYX";
 };
 
-struct KCZYX : public BaseTensorLayout
+struct KCZYX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KCZYX";
 };
 
 // weight tensor
 // packed KCX/KCYX/KCZYX
-struct GKCX : public BaseTensorLayout
+struct GKCX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKCX";
 };
 
-struct GKCYX : public BaseTensorLayout
+struct GKCYX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKCYX";
 };
 
-struct GKCZYX : public BaseTensorLayout
+struct GKCZYX : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKCZYX";
 };
 
 // weight tensor
 // packed KXC/KYXC/KZYXC
-struct KXC : public BaseTensorLayout
+struct KXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KXC";
 };
 
-struct KYXC : public BaseTensorLayout
+struct KYXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KYXC";
 };
 
-struct KZYXC : public BaseTensorLayout
+struct KZYXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KZYXC";
 };
 
 // weight tensor
 // packed GKXC/GKYXC/GKZYXC
-struct GKXC : public BaseTensorLayout
+struct GKXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKXC";
 };
 
-struct GKYXC : public BaseTensorLayout
+struct GKYXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKYXC";
 };
 
-struct GKZYXC : public BaseTensorLayout
+struct GKZYXC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GKZYXC";
 };
 
 // weight tensor
 // packed KXGC/KYXGC/KZYXGC
-struct KXGC : public BaseTensorLayout
+struct KXGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KXGC";
 };
 
-struct KYXGC : public BaseTensorLayout
+struct KYXGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KYXGC";
 };
 
-struct KZYXGC : public BaseTensorLayout
+struct KZYXGC : public BaseConvolutionLayout
 {
     static constexpr const char* name = "KZYXGC";
 };
 
 // weight tensor
 // strided
-struct G_K_X_C : public BaseTensorLayout
+struct G_K_X_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_K_X_C";
 };
 
-struct G_K_YX_C : public BaseTensorLayout
+struct G_K_YX_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_K_YX_C";
 };
 
-struct G_K_ZYX_C : public BaseTensorLayout
+struct G_K_ZYX_C : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_K_ZYX_C";
 };
 
 // output tensor
 // packed NKW/NKHW/NKDHW
-struct NKW : public BaseTensorLayout
+struct NKW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NKW";
 };
 
-struct NKHW : public BaseTensorLayout
+struct NKHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NKHW";
 };
 
-struct NKDHW : public BaseTensorLayout
+struct NKDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NKDHW";
 };
 
 // output tensor
 // packed GNKW/GNKHW/GNKDHW
-struct GNKW : public BaseTensorLayout
+struct GNKW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNKW";
 };
 
-struct GNKHW : public BaseTensorLayout
+struct GNKHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNKHW";
 };
 
-struct GNKDHW : public BaseTensorLayout
+struct GNKDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNKDHW";
 };
 
 // output tensor
 // packed NWK/NHWK/NDHWK
-struct NWK : public BaseTensorLayout
+struct NWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NWK";
 };
 
-struct NHWK : public BaseTensorLayout
+struct NHWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NHWK";
 };
 
-struct NDHWK : public BaseTensorLayout
+struct NDHWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NDHWK";
 };
 
 // output tensor
 // packed GNWK/GNHWK/GNDHWK
-struct GNWK : public BaseTensorLayout
+struct GNWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNWK";
 };
 
-struct GNHWK : public BaseTensorLayout
+struct GNHWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNHWK";
 };
 
-struct GNDHWK : public BaseTensorLayout
+struct GNDHWK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNDHWK";
 };
 
 // output tensor
 // packed NWGK/NHWGK/NDHWGK
-struct NWGK : public BaseTensorLayout
+struct NWGK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NWGK";
 };
 
-struct NHWGK : public BaseTensorLayout
+struct NHWGK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NHWGK";
 };
 
-struct NDHWGK : public BaseTensorLayout
+struct NDHWGK : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NDHWGK";
 };
 
-struct NGKW : public BaseTensorLayout
+struct NGKW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGKW";
 };
 
-struct NGKHW : public BaseTensorLayout
+struct NGKHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGKHW";
 };
 
-struct NGKDHW : public BaseTensorLayout
+struct NGKDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NGKDHW";
 };
 
 // output tensor
 // strided layout
-struct G_NW_K : public BaseTensorLayout
+struct G_NW_K : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NW_K";
 };
 
-struct G_NHW_K : public BaseTensorLayout
+struct G_NHW_K : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NHW_K";
 };
 
-struct G_NDHW_K : public BaseTensorLayout
+struct G_NDHW_K : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NDHW_K";
 };
 
 // for output bias
-struct G_K : public BaseTensorLayout
+struct G_K : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_K";
 };
 
 // K-reduced output tensor (packed)
-struct GNW : public BaseTensorLayout
+struct GNW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNW";
 };
 
-struct GNHW : public BaseTensorLayout
+struct GNHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNHW";
 };
 
-struct GNDHW : public BaseTensorLayout
+struct GNDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "GNDHW";
 };
 
 // K-reduced output tensor (packed)
-struct NWG : public BaseTensorLayout
+struct NWG : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NWG";
 };
 
-struct NHWG : public BaseTensorLayout
+struct NHWG : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NHWG";
 };
 
-struct NDHWG : public BaseTensorLayout
+struct NDHWG : public BaseConvolutionLayout
 {
     static constexpr const char* name = "NDHWG";
 };
 
 // K-reduced output tensor (strided)
-struct G_NW : public BaseTensorLayout
+struct G_NW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NW";
 };
 
-struct G_NHW : public BaseTensorLayout
+struct G_NHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NHW";
 };
 
-struct G_NDHW : public BaseTensorLayout
+struct G_NDHW : public BaseConvolutionLayout
 {
     static constexpr const char* name = "G_NDHW";
 };
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index d86f01e255..61d249fc93 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -47,7 +47,7 @@ struct Add
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
     {
-        y = type_convert<half_t>(x0) + x1;
+        y = x0 + type_convert<float>(x1);
     };
 
     template <>
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 4a87e8a277..81dd5e5dbb 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -95,7 +95,33 @@ __device__ inline f8x4_t i4_to_f8x4(int q)
     return amd_assembly_cvt_f8_to_f32(f32_0, f32_1, f32_2, f32_3);
 }
 
-__device__ inline f8x8_t i4_to_fp8x8(int q) { return amd_assembly_i4_to_fp8x8(q); }
+__device__ inline f8x8_t i4_to_fp8x8(int q)
+{
+#if defined(__gfx12__)
+    uint32_t fp8x4_0;
+    uint32_t fp8x4_1;
+    // todo: replace amd_assemble_cvt_f32_i4 with __builtin_amdgcn_cvt_off_f32_i4
+    float f32_0 = amd_assemble_cvt_f32_i4(q);
+    float f32_1 = amd_assemble_cvt_f32_i4(q >> 16);
+    fp8x4_0     = __builtin_amdgcn_cvt_pk_fp8_f32(f32_0, f32_1, 0, 0);
+    float f32_2 = amd_assemble_cvt_f32_i4(q >> 8);
+    float f32_3 = amd_assemble_cvt_f32_i4(q >> 24);
+    fp8x4_1     = __builtin_amdgcn_cvt_pk_fp8_f32(f32_2, f32_3, 0, 0);
+    q           = q >> 4;
+    f32_0       = amd_assemble_cvt_f32_i4(q);
+    f32_1       = amd_assemble_cvt_f32_i4(q >> 16);
+    fp8x4_0     = __builtin_amdgcn_cvt_pk_fp8_f32(f32_0, f32_1, fp8x4_0, 1);
+    f32_2       = amd_assemble_cvt_f32_i4(q >> 8);
+    f32_3       = amd_assemble_cvt_f32_i4(q >> 24);
+    fp8x4_1     = __builtin_amdgcn_cvt_pk_fp8_f32(f32_2, f32_3, fp8x4_1, 1);
+    return bit_cast<f8x8_t>(((static_cast<uint64_t>(fp8x4_1) << 32) | fp8x4_0));
+#elif defined(__gfx11__)
+    ignore = q;
+    return f8x8_t{};
+#else
+    return amd_assembly_i4_to_fp8x8(q);
+#endif
+}
 
 __device__ inline bhalf4_t i4_to_bhalf4(int q)
 {
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 7eca68bbf8..b6bc634d74 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -169,7 +169,8 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     }
 
     template <typename CGridDesc_M_N>
-    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    __host__ __device__ constexpr bool
+    CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
     {
         return true;
     }
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
index 36dc8aa6ba..7a09b84a63 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -267,6 +267,8 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
             e_grid_desc_m_n);
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CShuffleDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2ETileMap>
     __host__ __device__ static constexpr bool CheckValidity(const AGridDesc_M_K& a_grid_desc_m_k,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..fa7eb4faaa
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp
@@ -0,0 +1,1140 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdarg>
+#include "ck/utility/env.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+// Gemm0: A [M x K] x B0 [K x L] = Acc [M x L]
+// Gemm1: Acc [M x L] x B1 [L x N] = C [M x N]
+template <typename ADataType,
+          typename B0DataType,
+          typename Acc0DataType,
+          typename B1DataType,
+          typename Acc1DataType,
+          typename CShuffleDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename AccElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename AGridDesc,
+          typename B0GridDesc,
+          typename B1GridDesc,
+          typename CGridDesc_M_N,
+          index_t MPerBlock,
+          index_t LPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t NPerBlock,
+          index_t LTilePerBlock,
+          index_t L1Value,
+          index_t MPerWmma,
+          index_t LPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t LRepeat,
+          index_t NRepeat,
+          index_t BlockSize,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          bool ABlockLdsExtraM,
+          typename B0BlockTransferThreadClusterLengths_K0_L_K1,
+          typename B0BlockTransferThreadClusterArrangeOrder,
+          typename B0BlockTransferSrcAccessOrder,
+          index_t B0BlockTransferSrcVectorDim,
+          index_t B0BlockTransferSrcScalarPerVector,
+          index_t B0BlockTransferDstScalarPerVector_K1,
+          bool B0ThreadTransferSrcResetCoordinateAfterRun,
+          bool B0BlockLdsExtraL,
+          typename B1BlockTransferThreadClusterLengths_L0_N_L1,
+          typename B1BlockTransferThreadClusterArrangeOrder,
+          typename B1BlockTransferSrcAccessOrder,
+          index_t B1BlockTransferSrcVectorDim,
+          index_t B1BlockTransferSrcScalarPerVector,
+          index_t B1BlockTransferDstScalarPerVector_L1,
+          bool B1ThreadTransferSrcResetCoordinateAfterRun,
+          bool B1BlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          bool PadN,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1>
+struct GridwiseBatchedGemmGemm_wmma_cshuffle_v3
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+
+    static constexpr auto AK0 = Number<KPerBlock / AK1Value>{};
+    static constexpr auto AK1 = Number<AK1Value>{};
+    static constexpr auto BK0 = Number<KPerBlock / BK1Value>{};
+    static constexpr auto BK1 = Number<BK1Value>{};
+
+    static constexpr auto L0PerBlock = LTilePerBlock / L1Value;
+    static constexpr auto AL0 = Number<L0PerBlock / 2>{}; // TODO: Where does this 2 come from?
+    static constexpr auto AL1 = Number<L1Value>{};
+    static constexpr auto BL0 = Number<L0PerBlock>{};
+    static constexpr auto BL1 = Number<L1Value>{};
+
+    static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma);
+    static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma);
+    static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
+
+    // TODO: I am pretty sure this is always 16 and *should* always be 16.
+    static constexpr auto KPack =
+        math::integer_least_multiple(math::integer_least_multiple(AK1Value, BK1Value), 16);
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    __host__ __device__ static constexpr auto MakeABlockDescriptor()
+    {
+        constexpr auto a_block_desc = [&]() {
+            // K0->M->K1 Per Block
+            constexpr auto K0PerBlock    = KPerBlock / AK1;
+            constexpr auto max_lds_align = AK1;
+
+            if constexpr(ABlockLdsExtraM)
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, AK1),
+                    make_tuple(Number<MPerBlock + 1>{} * AK1, AK1, I1));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor_aligned(
+                    make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, AK1), max_lds_align);
+            }
+        }();
+
+        return a_block_desc;
+    }
+
+    __host__ __device__ static constexpr auto MakeB0BlockDescriptor()
+    {
+        constexpr auto b0_block_desc = [&]() {
+            // K0->L->BK1 Per Block
+            constexpr auto K0PerBlock    = KPerBlock / BK1;
+            constexpr auto max_lds_align = BK1;
+
+            if constexpr(B0BlockLdsExtraL)
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(Number<K0PerBlock>{}, Number<LPerBlock>{}, BK1),
+                    make_tuple(Number<LPerBlock + 1>{} * BK1, BK1, I1));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor_aligned(
+                    make_tuple(Number<K0PerBlock>{}, Number<LPerBlock>{}, BK1), max_lds_align);
+            }
+        }();
+
+        return b0_block_desc;
+    }
+
+    __host__ __device__ static constexpr auto MakeB1BlockDescriptor()
+    {
+        constexpr auto b1_block_desc = [&]() {
+            // L0->N->BL1 Per Block
+            constexpr auto max_lds_align = BL1;
+
+            if constexpr(B1BlockLdsExtraN)
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(Number<L0PerBlock>{}, Number<NPerBlock>{}, BL1),
+                    make_tuple(Number<NPerBlock + 1>{} * BL1, BL1, I1));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor_aligned(
+                    make_tuple(Number<L0PerBlock>{}, Number<NPerBlock>{}, BL1), max_lds_align);
+            }
+        }();
+
+        return b1_block_desc;
+    }
+
+    __host__ __device__ static constexpr auto MakeABlockSliceCopyStep()
+    {
+        constexpr auto a_block_copy_step = [&]() { return make_multi_index(AK0, 0, 0); }();
+        return a_block_copy_step;
+    }
+
+    __host__ __device__ static constexpr auto MakeB0BlockSliceCopyStep()
+    {
+        constexpr auto b0_block_copy_step = [&]() { return make_multi_index(BK0, 0, 0); }();
+        return b0_block_copy_step;
+    }
+
+    __host__ __device__ static constexpr auto MakeB1BlockSliceCopyStep()
+    {
+        constexpr auto b1_block_copy_step = [&]() { return make_multi_index(L0PerBlock, 0, 0); }();
+        return b1_block_copy_step;
+    }
+
+    template <typename ABlockDesc_>
+    __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&)
+    {
+        constexpr auto a_wave_desc = [&]() {
+            // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
+            constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
+            constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
+            constexpr auto A_KRow = I2;
+#else
+            constexpr auto A_KRow = I1;
+#endif
+            return transform_tensor_descriptor(
+                ABlockDesc_{},
+                make_tuple(make_unmerge_transform(make_tuple(Number<A_K0 / A_KRow>{}, A_KRow)),
+                           make_unmerge_transform(
+                               make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWmma>{})),
+                           make_pass_through_transform(Number<A_K1>{})),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+        }();
+
+        return a_wave_desc;
+    }
+
+    template <typename B0BlockDesc_>
+    __host__ __device__ static constexpr auto MakeB0WaveDescriptor(const B0BlockDesc_&)
+    {
+        constexpr auto b0_wave_desc = [&]() {
+            // BK0_L_BK1 -> BK0_LRepeat_Lwaves_BKRow_LPerWmma_BK1
+            constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0);
+            constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
+            constexpr auto B_KRow = I2;
+#else
+            constexpr auto B_KRow = I1;
+#endif
+            return transform_tensor_descriptor(
+                B0BlockDesc_{},
+                make_tuple(make_unmerge_transform(make_tuple(Number<B_K0 / B_KRow>{}, B_KRow)),
+                           make_unmerge_transform(
+                               make_tuple(Number<LRepeat>{}, Number<LWaves>{}, Number<LPerWmma>{})),
+                           make_pass_through_transform(Number<B_K1>{})),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+        }();
+
+        return b0_wave_desc;
+    }
+
+    template <typename A1BlockDesc_AL0_M_AL1>
+    __host__ __device__ static constexpr auto
+    MakeA1WaveDescriptor_L0_M0_M1_M2_L1(const A1BlockDesc_AL0_M_AL1&)
+    {
+        constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0);
+        constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2);
+        constexpr auto A_LRow  = I1;
+        return transform_tensor_descriptor(
+            A1BlockDesc_AL0_M_AL1{},
+            make_tuple(make_unmerge_transform(make_tuple(Number<A_L0>{}, A_LRow)),
+                       make_unmerge_transform(make_tuple(Number<MRepeat>{}, I1, I1)),
+                       make_pass_through_transform(Number<A_L1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+    }
+
+    template <typename B1BlockDesc_>
+    __host__ __device__ static constexpr auto MakeB1WaveDescriptor(const B1BlockDesc_&)
+    {
+
+        constexpr auto b1_wave_desc = [&]() {
+            // BL0_N_BL1 -> BL0_NRepeat_Nwaves_NPerWmma_BL1
+            constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0);
+            constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
+            constexpr auto B_LRow = I2;
+#else
+            constexpr auto B_LRow = I1;
+#endif
+            return transform_tensor_descriptor(
+                B1BlockDesc_{},
+                make_tuple(make_unmerge_transform(make_tuple(Number<B_L0 / B_LRow>{}, B_LRow)),
+                           make_unmerge_transform(
+                               make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWmma>{})),
+                           make_pass_through_transform(Number<B_L1>{})),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+        }();
+
+        return b1_wave_desc;
+    }
+
+    __host__ __device__ static constexpr auto
+    // *Caution Here repeat is shuffle repeat
+    GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
+    {
+        constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMRepeatPerShuffle * MWaves * MPerWmma>{},
+                           I1,
+                           Number<CShuffleNRepeatPerShuffle * NWaves * NPerWmma>{}));
+
+        return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat;
+    }
+
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        const index_t gemm0_bytes_end =
+            (SharedMemTrait::a_block_space_size_aligned * sizeof(ADataType) +
+             SharedMemTrait::b0_block_space_size_aligned * sizeof(B0DataType));
+
+        const index_t gemm1_bytes_end =
+            (SharedMemTrait::b1_block_space_offset +
+             SharedMemTrait::b1_block_space_size_aligned * sizeof(B1DataType));
+
+        const index_t acc0_bytes_end =
+            SharedMemTrait::reduction_space_offset +
+            SharedMemTrait::reduction_space_size_aligned * sizeof(Acc0DataType);
+
+        const index_t c_block_bytes_end =
+            SharedMemTrait::c_block_space_size * sizeof(CShuffleDataType);
+
+        return math::max(gemm0_bytes_end, gemm1_bytes_end, acc0_bytes_end, c_block_bytes_end);
+    }
+
+    // Blockwise gemm pipeline for gemm0, this replaces the old GridwiseGemmPipe +
+    // BlockwiseGemmWMMA. The latter had two enableLDS bools which we don't
+    // have anymore, the new pipelines ALWAYS use lds. Furthermore the original BlockwiseGemmWMMA
+    // used TransposeC = true which we still need to make the operation work.
+    using BlockwiseGemmPipe =
+        remove_cvref_t<decltype(BlockGemmPipeline_Selector<
+                                BlkGemmPipelineVer,
+                                BlkGemmPipeSched,
+                                BlockSize,
+                                ADataType,
+                                B0DataType,
+                                // TODO: Check if these compute types should always be
+                                //  equal to data type.
+                                ADataType,  // ComputeTypeA
+                                B0DataType, // ComputeTypeB
+                                Acc0DataType,
+                                decltype(MakeAWaveDescriptor(MakeABlockDescriptor())),
+                                decltype(MakeB0WaveDescriptor(MakeB0BlockDescriptor())),
+                                ABlockTransferSrcScalarPerVector,
+                                B0BlockTransferSrcScalarPerVector,
+                                MPerBlock,
+                                LPerBlock,
+                                KPerBlock,
+                                MPerWmma,
+                                LPerWmma,
+                                MRepeat,
+                                LRepeat,
+                                KPack,
+                                true>())>; // TransposeC (must be true to work), C' = B' x A'
+
+    // block_id to matrix tile idx (m0, n0) mapping is controlled by {M01, N01}
+    template <typename Block2CTileMap>
+    __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc,
+                                                            const B0GridDesc& b0_grid_desc,
+                                                            const B1GridDesc& b1_grid_desc,
+                                                            const CGridDesc_M_N& c_grid_desc_m_n,
+                                                            const Block2CTileMap& block_2_ctile_map)
+    {
+        // Print lambda with env check and printf() style formmating.
+        const char* curFunc = __func__;
+        auto print          = [&curFunc](const char* format, ...) -> void {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wformat-nonliteral"
+#endif
+                va_list args;
+                va_start(args, format);
+                std::vfprintf(stdout, format, args);
+                va_end(args);
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+                std::cout << "In file: " << __FILE__ << ", function: " << curFunc << "\n";
+            }
+        };
+
+        static_assert(MPerBlock % (MPerWmma * MRepeat) == 0 &&
+                          LPerBlock % (LPerWmma * LRepeat) == 0 &&
+                          NPerBlock % (NPerWmma * NRepeat) == 0,
+                      "Invalid tuning param!");
+
+        const auto M = a_grid_desc.GetLength(I1);
+        const auto L = b0_grid_desc.GetLength(I1);
+        const auto K = a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2);
+        const auto N = b1_grid_desc.GetLength(I1);
+
+        if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1)))
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                print("GridwiseOp: M/N Length err, A_M/N = %d, %d | C_M/N = %d, %d\n",
+                      M,
+                      N,
+                      c_grid_desc_m_n.GetLength(I0),
+                      c_grid_desc_m_n.GetLength(I1));
+            }
+            return false;
+        }
+
+        if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && N % NPerBlock == 0))
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                print("GridwiseOp: M/L/K/N Division err, M/L/K/N = %d, %d, %d, %d | "
+                      "M/L/K/NPerBlock = "
+                      "%d, %d, %d, %d\n",
+                      M,
+                      L,
+                      K,
+                      N,
+                      MPerBlock,
+                      LPerBlock,
+                      KPerBlock,
+                      NPerBlock);
+            }
+            return false;
+        }
+
+        // check gemm1 gridwise gemm pipeline
+        if(!(LPerBlock % LTilePerBlock == 0))
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                print("GridwiseOp: inner loop division, L/LTilePerblock: %d, %d\n",
+                      LPerBlock,
+                      LTilePerBlock);
+            }
+            return false;
+        }
+
+        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                print("GridwiseOp: invalid block_2_ctile_map\n");
+            }
+            return false;
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = math::integer_divide_ceil(K, KPerBlock);
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ __device__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = math::integer_divide_ceil(K, KPerBlock);
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        const auto M = c_grid_desc_m_n.GetLength(I0);
+        const auto N = c_grid_desc_m_n.GetLength(I1);
+
+        const auto MBlock = M / MPerBlock;
+        const auto NBlock = N / NPerBlock;
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
+    {
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
+    }
+
+    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock =
+        remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            CGridDesc_M_N{}))>;
+    using DefaultBlock2CTileMap =
+        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;
+
+    struct SharedMemTrait
+    {
+        // LDS allocation for A and B: be careful of alignment
+        static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), BL1);
+
+        static constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            MakeABlockDescriptor().GetElementSpaceSize(), max_lds_align);
+        static constexpr auto b0_block_space_size_aligned = math::integer_least_multiple(
+            MakeB0BlockDescriptor().GetElementSpaceSize(), max_lds_align);
+        static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple(
+            MakeB1BlockDescriptor().GetElementSpaceSize(), max_lds_align);
+
+        static constexpr auto a_block_space_offset  = 0;
+        static constexpr auto b0_block_space_offset = a_block_space_size_aligned;
+        static constexpr auto b1_block_space_offset = 0;
+
+        // LDS allocation for reduction
+        // Feature to add, IntraThread Reduction
+        static constexpr index_t reduction_space_size_aligned =
+            math::integer_least_multiple(BlockSize, max_lds_align);
+
+        static constexpr auto reduction_space_offset = 0;
+
+        // LDS allocation for C shuffle in LDS
+        static constexpr auto c_block_space_size =
+            GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
+                .GetElementSpaceSize();
+    };
+
+    template <bool HasMainKBlockLoop,
+              TailNumber TailNum,
+              typename Block2CTileMap = DefaultBlock2CTileMap>
+    __device__ static void Run(const ADataType* __restrict__ p_a_grid,
+                               const B0DataType* __restrict__ p_b0_grid,
+                               const B1DataType* __restrict__ p_b1_grid,
+                               CDataType* __restrict__ p_c_grid,
+                               void* __restrict__ p_shared,
+                               const AGridDesc& a_grid_desc,
+                               const B0GridDesc& b0_grid_desc,
+                               const B1GridDesc& b1_grid_desc,
+                               const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const AElementwiseOperation& a_element_op,
+                               const B0ElementwiseOperation& b0_element_op,
+                               const AccElementwiseOperation& acc_element_op,
+                               const B1ElementwiseOperation& b1_element_op,
+                               const CElementwiseOperation& c_element_op,
+                               const Block2CTileMap& block_2_ctile_map)
+    {
+        // clang-format off
+/*******************************************************************************/
+// Memory buffer zone.
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc.GetElementSpaceSize());
+        const auto b0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b0_grid, b0_grid_desc.GetElementSpaceSize());
+        const auto b1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b1_grid, b1_grid_desc.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+/*******************************************************************************/
+// BlockIdx.x -> [BlockId.m, BlockId.n]
+        const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        { return; }
+
+        // Store BlockId into SGPR
+        const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+        const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+/*******************************************************************************/
+// set up Gemm0
+/*******************************************************************************/
+
+/*******************************************************************************/
+// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destination of BlockWise_Copy
+        constexpr auto a_block_desc  = MakeABlockDescriptor();
+        constexpr auto b0_block_desc = MakeB0BlockDescriptor();
+
+        auto a_block_trait = [&](){
+            // A matrix blockwise copy
+            constexpr auto AK0PerBlock = KPerBlock/ AK1;
+            auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<ADataType*>(p_shared) + SharedMemTrait::a_block_space_offset, 
+                SharedMemTrait::a_block_space_size_aligned);
+
+            auto a_blockwise_copy =
+                ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock,
+/* typename SrcElementwiseOperation,              */     AElementwiseOperation,
+/* typename DstElementwiseOperation,              */     ck::tensor_operation::element_wise::PassThrough,
+/* InMemoryDataOperationEnum DstInMemOp,          */     InMemoryDataOperationEnum::Set,
+/* typename BlockSliceLengths,                    */     Sequence<AK0PerBlock, MPerBlock, AK1>,
+/* typename ThreadClusterLengths,                 */     ABlockTransferThreadClusterLengths_K0_M_K1,
+/* typename ThreadClusterArrangeOrder,            */     ABlockTransferThreadClusterArrangeOrder,
+/* typename SrcData,                              */     ADataType,
+/* typename DstData,                              */     ADataType,
+/* typename SrcDesc,                              */     decltype(a_grid_desc),
+/* typename DstDesc,                              */     decltype(a_block_desc),
+/* typename SrcDimAccessOrder,                    */     ABlockTransferSrcAccessOrder,
+/* typename DstDimAccessOrder,                    */     Sequence<0, 1, 2>,
+/* index_t SrcVectorDim,                          */     ABlockTransferSrcVectorDim,
+/* index_t DstVectorDim,                          */     2,
+/* index_t SrcScalarPerVector,                    */     ABlockTransferSrcScalarPerVector,
+/* index_t DstScalarPerVector,                    */     ABlockTransferDstScalarPerVector_K1,
+/* index_t SrcScalarStrideInVector,               */     1,
+/* index_t DstScalarStrideInVector,               */     1,
+/* bool ThreadTransferSrcResetCoordinateAfterRun, */     AThreadTransferSrcResetCoordinateAfterRun,
+/* bool ThreadTransferDstResetCoordinateAfterRun, */     true,
+                                                        BlockwiseGemmPipe::GlobalBufferNum>( 
+            a_grid_desc,
+            make_multi_index(0, m_block_data_idx_on_grid, 0),
+            a_element_op,
+            a_block_desc,
+            make_multi_index(0, 0, 0),
+            ck::tensor_operation::element_wise::PassThrough{});
+
+            return make_tuple(a_block_buf, a_blockwise_copy);
+        };
+        
+        auto b0_block_trait = [&](){
+            auto b0_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<B0DataType*>(p_shared) + SharedMemTrait::b0_block_space_offset, 
+                SharedMemTrait::b0_block_space_size_aligned);
+
+            auto b0_blockwise_copy =
+                ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                            B0ElementwiseOperation,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            InMemoryDataOperationEnum::Set,
+                                            Sequence<BK0, LPerBlock, BK1>,
+                                            B0BlockTransferThreadClusterLengths_K0_L_K1,
+                                            B0BlockTransferThreadClusterArrangeOrder,
+                                            B0DataType,
+                                            B0DataType,
+                                            decltype(b0_grid_desc),
+                                            decltype(b0_block_desc),
+                                            B0BlockTransferSrcAccessOrder,
+                                            Sequence<0, 1, 2>,
+                                            B0BlockTransferSrcVectorDim,
+                                            2,
+                                            B0BlockTransferSrcScalarPerVector,
+                                            B0BlockTransferDstScalarPerVector_K1,
+                                            1,
+                                            1,
+                                            B0ThreadTransferSrcResetCoordinateAfterRun,
+                                            true,
+                                            BlockwiseGemmPipe::GlobalBufferNum>(
+            b0_grid_desc,
+            make_multi_index(0, 0, 0),
+            b0_element_op,
+            b0_block_desc,
+            make_multi_index(0, 0, 0),
+            ck::tensor_operation::element_wise::PassThrough{});
+            
+            return make_tuple(b0_block_buf, b0_blockwise_copy);
+        };
+
+        auto a_block_buf       = a_block_trait()[I0];
+        auto a_blockwise_copy  = a_block_trait()[I1];
+        
+        auto b0_block_buf       = b0_block_trait()[I0];
+        auto b0_blockwise_copy  = b0_block_trait()[I1];
+
+/*******************************************************************************/
+        // Gemm0
+        // Blockwise GEMM0 pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm0_pipeline = BlockwiseGemmPipe{};
+        auto acc0_thread_buf          = blockwise_gemm0_pipeline.GetCThreadBuffer();
+
+        // Note that we are using the transposeC version of GetCThreadDescriptor.
+        constexpr auto acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = 
+            blockwise_gemm0_pipeline.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs();
+        
+        constexpr auto mrepeat            = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0);
+        constexpr auto mwave              = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1);
+        constexpr auto mthreadpersubgroup = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2);
+        constexpr auto lrepeat            = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3);
+        constexpr auto lwave              = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4);
+        constexpr auto lsubgroup          = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5);
+        constexpr auto laccvgprs          = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6);
+
+        constexpr auto acc0_thread_desc_l0perblock_mperblock_l1 = transform_tensor_descriptor(
+            acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs,
+            make_tuple(make_merge_transform_v3_division_mod(make_tuple(lrepeat, lwave, lsubgroup)),
+                       make_merge_transform_v3_division_mod(make_tuple(mrepeat, mwave, mthreadpersubgroup)),
+                       make_pass_through_transform(laccvgprs)),
+            make_tuple(Sequence<3, 4, 5>{}, Sequence<0, 1, 2>{}, Sequence<6>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+/*******************************************************************************/        
+        // Shift Per SUB_K
+        constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep();
+        constexpr auto b0_block_slice_copy_step = MakeB0BlockSliceCopyStep();
+
+        const auto a_block_reset_copy_step = [&](){
+            return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0);
+        }();
+
+        const auto b0_block_reset_copy_step = [&](){
+            return make_multi_index(-b0_grid_desc.GetLength(I0), LPerBlock, 0);
+        }();
+
+        const auto K = [&](){
+            return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2);
+        }();
+
+        const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock);
+        
+/*******************************************************************************/
+// set up Gemm1
+/*******************************************************************************/
+        // Acc0 thread buffer -> A1 thread buffer -> blockwise gemm
+        // A1 matrix in VGPR
+        constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple(
+            Number<AL0 * AL1 / laccvgprs>{}, 
+            Number<mrepeat * mwave * mthreadpersubgroup>{}, 
+            Number<laccvgprs>{});
+
+        constexpr auto A1ThreadSliceL0PerBlock  = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I0];
+        constexpr auto A1ThreadSliceMPerBlock   = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I1];
+        constexpr auto A1ThreadSliceL1          = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I2];
+
+        constexpr auto a1_thread_desc_l0perblock_mperblock_l1 = make_naive_tensor_descriptor(
+            make_tuple(A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadSliceL1),
+            make_tuple(A1ThreadSliceMPerBlock * A1ThreadSliceL1, A1ThreadSliceL1, I1));
+
+        // A1 matrix blockwise copy
+        auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
+            Acc0DataType,
+            ADataType,
+            decltype(acc0_thread_desc_l0perblock_mperblock_l1),
+            decltype(a1_thread_desc_l0perblock_mperblock_l1),
+            tensor_operation::element_wise::PassThrough,
+            Sequence<A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadSliceL1>,
+            Sequence<0, 1, 2>,
+            2,
+            laccvgprs>{tensor_operation::element_wise::PassThrough{}};
+   
+        auto a1_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ADataType>(
+            a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize());       
+            
+        constexpr auto b1_block_desc = MakeB1BlockDescriptor();
+
+        auto b1_block_trait = [&](){
+            auto b1_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<B1DataType*>(p_shared) + SharedMemTrait::b1_block_space_offset, 
+                SharedMemTrait::b1_block_space_size_aligned);
+
+            auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1<
+                                                    ThisThreadBlock,
+/* typename SrcElementwiseOperation,              */ B1ElementwiseOperation,
+/* typename DstElementwiseOperation,              */ tensor_operation::element_wise::PassThrough,
+/* InMemoryDataOperationEnum DstInMemOp,          */ InMemoryDataOperationEnum::Set,
+/* typename BlockSliceLengths,                    */ Sequence<BL0, NPerBlock, BL1>,
+/* typename ThreadClusterLengths,                 */ B1BlockTransferThreadClusterLengths_L0_N_L1,
+/* typename ThreadClusterArrangeOrder,            */ B1BlockTransferThreadClusterArrangeOrder,
+/* typename SrcData,                              */ B1DataType,
+/* typename DstData,                              */ B1DataType,
+/* typename SrcDesc,                              */ decltype(b1_grid_desc),
+/* typename DstDesc,                              */ decltype(b1_block_desc),
+/* typename SrcDimAccessOrder,                    */ B1BlockTransferSrcAccessOrder,
+/* typename DstDimAccessOrder,                    */ Sequence<1, 0, 2>,
+/* index_t SrcVectorDim,                          */ B1BlockTransferSrcVectorDim,
+/* index_t DstVectorDim,                          */ 2,
+/* index_t SrcScalarPerVector,                    */ B1BlockTransferSrcScalarPerVector,
+/* index_t DstScalarPerVector,                    */ B1BlockTransferDstScalarPerVector_L1,
+/* index_t SrcScalarStrideInVector,               */ 1,
+/* index_t DstScalarStrideInVector,               */ 1,
+/* bool ThreadTransferSrcResetCoordinateAfterRun, */ B1ThreadTransferSrcResetCoordinateAfterRun,
+/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, // DstResetCoord
+                                                     1>( // Used to be NumGemmKPrefetchStage, never tested / used for != 1
+                b1_grid_desc,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b1_element_op,
+                b1_block_desc,
+                make_multi_index(0, 0, 0),
+                tensor_operation::element_wise::PassThrough{});
+                    
+            return make_tuple(b1_block_buf, b1_blockwise_copy);
+        };
+
+        auto b1_block_buf       = b1_block_trait()[I0];
+        auto b1_blockwise_copy  = b1_block_trait()[I1];
+
+        constexpr auto b1_block_slice_copy_step = MakeB1BlockSliceCopyStep();
+
+        auto blockwise_gemm1 =
+            BlockwiseGemmWMMA<BlockSize,
+                              ADataType,
+                              B1DataType,
+                              Acc1DataType,
+                              decltype(MakeA1WaveDescriptor_L0_M0_M1_M2_L1(a1_thread_desc_l0perblock_mperblock_l1)),
+                              decltype(MakeB1WaveDescriptor(b1_block_desc)),
+                              MPerBlock,
+                              NPerBlock,
+                              LTilePerBlock,
+                              MPerWmma,
+                              NPerWmma,
+                              MRepeat,
+                              NRepeat,
+                              KPack,
+                              false, // Acc1EnableLds
+                              true,  // B1EnableLds
+                              true>{make_tuple(0, 0, 0, 0, 0, 0)};
+
+        auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer();
+
+        const auto L = [&](){
+            return b0_grid_desc.GetLength(I1);
+        }();
+
+        const index_t num_gemm1_l_block_outer_loop = L / LPerBlock;
+        constexpr index_t num_gemm1_l_block_inner_loop = LPerBlock / LTilePerBlock;
+
+        // Initialize C
+        StaticBuffer<AddressSpaceEnum::Vgpr, Acc1DataType, acc1_thread_buf.Size(), true> c_thread_buf;
+        c_thread_buf.Clear();
+
+        // Empty BScale struct for the blockwise pipeline.
+        using BScale        = typename BlockwiseGemmPipe::Empty;
+        auto b_scale_struct = BScale{};
+
+/*******************************************************************************/
+        // 
+        // Kernel Main Stage
+        //
+        index_t gemm1_l_block_outer_index = 0;
+        // Outer loop, along GEMM_L
+        // Inner loop, along GEMM_K
+        do {
+            blockwise_gemm0_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc, 
+                                                                              a_block_desc, 
+                                                                              a_blockwise_copy, 
+                                                                              a_grid_buf,
+                                                                              a_block_buf, 
+                                                                              a_block_slice_copy_step,
+                                                                              b0_grid_desc,
+                                                                              b0_block_desc,
+                                                                              b0_blockwise_copy,
+                                                                              b0_grid_buf,
+                                                                              b0_block_buf,
+                                                                              b0_block_slice_copy_step,
+                                                                              acc0_thread_buf,
+                                                                              b_scale_struct,
+                                                                              KBlockMainLoop,
+                                                                              1); // num_k_block_per_scale
+
+            static_for<0, acc0_thread_buf.Size(), 1>{}(
+                    [&](auto i) { acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); });
+
+            block_sync_lds();
+
+            // gemm1
+            {
+                // TODO: explore using dynamic buffer for a1 thread buffer
+                // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(),
+                // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that
+                // the A1 source buffer is static buffer holding the output of first GEMM and
+                // requires constexpr offset by design. Therefore, we pass tensor coordinate offset
+                // explicitly in Run() below.
+
+                // Initialize acc1
+                acc1_thread_buf.Clear();
+
+                // preload data into LDS
+                b1_blockwise_copy.RunRead(b1_grid_desc, b1_grid_buf);
+
+                b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc,
+                                                     b1_block_slice_copy_step);
+
+                block_sync_lds(); // wait for reduction LDS read
+
+                b1_blockwise_copy.RunWrite(b1_block_desc, b1_block_buf);
+
+                // main body
+                if constexpr(num_gemm1_l_block_inner_loop > 1)
+                {
+                    static_for<0, num_gemm1_l_block_inner_loop - 1, 1>{}([&](auto i) {
+                        // Data cast from Acc0DataType to ADataType happens here
+                        a1_blockwise_copy.Run(acc0_thread_desc_l0perblock_mperblock_l1,
+                                              make_tuple(Number<i * A1ThreadSliceL0PerBlock>{}, I0, I0),
+                                              acc0_thread_buf,
+                                              a1_thread_desc_l0perblock_mperblock_l1,
+                                              make_tuple(I0, I0, I0),
+                                              a1_thread_buf);
+
+                        b1_blockwise_copy.RunRead(b1_grid_desc, b1_grid_buf);
+
+                        block_sync_lds();
+
+                        blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
+
+                        block_sync_lds();
+
+                        b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc,
+                                                             b1_block_slice_copy_step);
+
+                        b1_blockwise_copy.RunWrite(b1_block_desc, b1_block_buf);
+                    });
+                }
+                // tail
+                {
+                    a1_blockwise_copy.Run(
+                        acc0_thread_desc_l0perblock_mperblock_l1,
+                        make_tuple(
+                            Number<(num_gemm1_l_block_inner_loop - 1) * A1ThreadSliceL0PerBlock>{}, I0, I0),
+                        acc0_thread_buf,
+                        a1_thread_desc_l0perblock_mperblock_l1,
+                        make_tuple(I0, I0, I0),
+                        a1_thread_buf);
+
+                    block_sync_lds();
+            
+                    blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
+                }
+            } // end gemm1
+
+            constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs =
+                blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs();
+            constexpr auto c_mrepeat            = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0);
+            constexpr auto c_mwave              = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1);
+            constexpr auto c_mthreadpersubgroup = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2);
+            constexpr auto c_nrepeat            = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3);
+            constexpr auto c_nwave              = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4);
+            constexpr auto c_nsubgroup          = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5);
+            constexpr auto c_naccvgprs          = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6);
+            
+            constexpr auto c_thread_slice_desc_m_n = make_naive_tensor_descriptor_packed(
+                make_tuple(c_mrepeat * c_mwave * c_mthreadpersubgroup, 
+                           c_nrepeat * c_nwave * c_nsubgroup * c_naccvgprs));
+            constexpr auto c_thread_buf_slice_m = c_thread_slice_desc_m_n.GetLength(I0);
+            constexpr auto c_thread_buf_slice_n = c_thread_slice_desc_m_n.GetLength(I1);
+
+            static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) {
+                static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) {
+                    auto I = Number<c_thread_slice_desc_m_n.CalculateOffset(make_tuple(iM, iN))>{};
+                    Acc1DataType acc1  = acc1_thread_buf[I]; // P*V
+                    Acc1DataType c     = c_thread_buf[I];    // O
+                    Acc1DataType c_new = c + acc1; // Simply add results since we are no longer using softmax.
+
+                    c_thread_buf(I) = c_new; // O_new
+                });
+            });
+
+            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc,
+                                                a_block_reset_copy_step); // rewind K
+            b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc,
+                                                b0_block_reset_copy_step); // rewind K and step N
+
+            block_sync_lds(); // wait for gemm1 LDS read
+        }while(++gemm1_l_block_outer_index < num_gemm1_l_block_outer_loop);
+/*******************************************************************************/
+        // write out to C, implement shuffle
+        {
+            constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs =  
+            blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs();
+
+            // This API Provide All dimension (size) you need
+            constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp =
+                blockwise_gemm1.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs();
+
+            constexpr auto MWave              = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I1);
+            constexpr auto MThreadPerSubGroup = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I2);
+            constexpr auto NWave              = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I4);
+            constexpr auto NSubGroup          = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I5);
+            constexpr auto NAccVgprs          = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I6);
+
+            // LDS descriptor, shuffle and write out in MRepeat x NRepeat times
+            constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+                GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = transform_tensor_descriptor(
+                c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMRepeatPerShuffle>{}, // MRepeat per shuffle repeat
+                        MWave,                               // MWave
+                        MThreadPerSubGroup                   // MThreadPerSubGroup = MPerWmma
+                        )),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNRepeatPerShuffle>{}, // NRepeat per shuffle repeat
+                        NWave,                               // NWave
+                        NSubGroup,
+                        NAccVgprs))),                        // NSubGroup * NAccVgprs = NPerWmma
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<>{}, Sequence<0, 1, 2>{}, Sequence<>{}, Sequence<3, 4, 5, 6>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block = blockwise_gemm1.CalculateCThreadOriginDataIndex(I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MThreadPerSubGroup))),
+                make_tuple(Sequence<0, 1, 2>{}),
+                make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NSubGroup, NAccVgprs))),
+                make_tuple(Sequence<0, 1, 2, 3>{}),
+                make_tuple(Sequence<0>{}));
+            
+            const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor.CalculateBottomIndex(
+                make_multi_index(m_thread_data_on_block));
+            
+            const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor.CalculateBottomIndex(
+                make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<Acc1DataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs),
+                                                   decltype(c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMRepeatPerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            CShuffleNRepeatPerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            NAccVgprs>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                                   6,
+                                                   8, // vector write pixel
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs,
+                    make_multi_index(0,
+                                     m_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     0,
+                                     n_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I2],
+                                     n_thread_data_on_block_idx[I3]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMRepeatPerShuffle * MWave * MPerWmma,
+                         1,
+                         CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,        // typename SrcData,
+                CDataType,               // typename DstData,
+                decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0),
+                 c_element_op};
+
+            // space filling curve for local reg & global memory
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MRepeat, 1, 1, NRepeat, 1, 1, NAccVgprs>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                  Sequence<CShuffleMRepeatPerShuffle,
+                                           1,
+                                           1,
+                                           CShuffleNRepeatPerShuffle,
+                                           1,
+                                           1,
+                                           NAccVgprs>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMRepeatPerShuffle * MWave * MPerWmma,
+                                           1,
+                                           CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+        // clang-format on
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
index 258d0ad0ca..a15f11a93f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -195,6 +195,28 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
         return math::max(gemm0_bytes_end, gemm1_bytes_end, c_block_bytes_end);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        constexpr bool valid = ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+        if constexpr(!valid)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -549,16 +571,30 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
         constexpr auto A1ThreadSlice_K0_M_K1 =
             make_tuple(Number<Gemm1KPerBlock / n4 / AccN3>{}, Number<m0 * m1 * m2>{}, Number<n4>{});
 
-        constexpr auto A1ThreadSliceK0        = A1ThreadSlice_K0_M_K1[I0];
-        constexpr auto A1ThreadSliceM         = A1ThreadSlice_K0_M_K1[I1];
-        constexpr auto A1ThreadSliceK1        = A1ThreadSlice_K0_M_K1[I2];
+        constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0];
+        constexpr auto A1ThreadSliceM  = A1ThreadSlice_K0_M_K1[I1];
+        constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2];
+
+#if defined(__gfx11__)
+        constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor_packed(
+            make_tuple(A1ThreadSliceK0, A1ThreadSliceM, Number<A1ThreadSliceK1 * 2>{}));
+        auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow<
+            FloatGemmAcc,
+            FloatAB,
+            decltype(acc_thread_desc_k0_m_k1),
+            decltype(a1_thread_desc_k0_m_k1),
+            decltype(acc_element_op),
+            Sequence<A1ThreadSliceK0, A1ThreadSliceM, A1ThreadSliceK1>,
+            Sequence<1, 0, 2>,
+            2,
+            n4,
+            0x76543210,
+            0xfedcba98,
+            true>{make_tuple(0, 0, 0)};
+#else
         constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor(
             A1ThreadSlice_K0_M_K1,
             make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
-
-        // B1 matrix in LDS memory, dst of blockwise copy
-        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-
         // A1 matrix blockwise copy
         auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
             FloatGemmAcc,
@@ -570,6 +606,9 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
             Sequence<1, 0, 2>,
             2,
             n4>{acc_element_op};
+#endif
+        // B1 matrix in LDS memory, dst of blockwise copy
+        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
 
         // B1 matrix blockwise copy
         auto b1_blockwise_copy =
@@ -619,10 +658,15 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
         // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
         // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
         // therefore we may just as well assign Gemm1KPack = group_size
-
+#if defined(__gfx11__)
+        constexpr index_t Gemm1KPack =
+            MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size * 2;
+        static_assert(
+            XdlopsGemm<FloatAB, MPerXdl, NPerXdl, Gemm1KPack, FloatAB, false>{}.K0PerXdlops == 1);
+#else
         constexpr index_t Gemm1KPack =
             MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size;
-
+#endif
         auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2<
             BlockSize,
             FloatAB,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
index e8f8caa10d..b8f5a545aa 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -99,7 +99,6 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
     static constexpr auto I6 = Number<6>{};
     static constexpr auto I7 = Number<7>{};
 
-    static constexpr auto WaveSize = 64;
     // K1 should be Number<...>
     // Gemm0
     static constexpr auto A0K1 = Number<A0K1Value>{};
@@ -110,6 +109,7 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 
     static constexpr auto Gemm0MWaves = Gemm0MPerBlock / (Gemm0MPerXdl * Gemm0MXdlPerWave);
     static constexpr auto Gemm0NWaves = Gemm0NPerBlock / (Gemm0NPerXdl * Gemm0NXdlPerWave);
+    static constexpr auto WaveSize    = BlockSize / (Gemm0MWaves * Gemm0NWaves);
     // Gemm1
     static constexpr auto B1K1         = Number<B1K1Value>{};
     static constexpr auto B1K0PerBlock = Number<Gemm1KPerBlock / B1K1Value>{};
@@ -255,18 +255,63 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         return math::max(gemm0_bytes_end, gemm1_bytes_end, c1_block_bytes_end);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+                   BlockSize,
+                   Gemm0MPerBlock,
+                   Gemm0NPerBlock,
+                   Gemm0MPerXdl,
+                   Gemm0NPerXdl,
+                   Gemm0MXdlPerWave,
+                   Gemm0NXdlPerWave,
+                   E1DataType,
+                   CGlobalMemoryDataOperation_>() &&
+               ck::tensor_operation::device::IsValidGemmCompilationParameter<
+                   BlockSize,
+                   Gemm0MPerBlock,
+                   Gemm1NPerBlock,
+                   Gemm0MPerXdl,
+                   Gemm0NPerXdl,
+                   Gemm0MXdlPerWave,
+                   Gemm1NXdlPerWave,
+                   E1DataType,
+                   CGlobalMemoryDataOperation_>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2E1TileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const A0GridDesc_M_K& a0_grid_desc_m_k,
-                  const B0GridDesc_N_K& b0_grid_desc_n_k,
-                  const B1GridDesc_N_K& b1_grid_desc_n_k,
-                  const E1GridDesc_M_N& e1_grid_desc_m_n,
-                  const Block2E1TileMap& block_2_e1tile_map)
+    __host__ static constexpr bool CheckValidity(const A0GridDesc_M_K& a0_grid_desc_m_k,
+                                                 const B0GridDesc_N_K& b0_grid_desc_n_k,
+                                                 const B1GridDesc_N_K& b1_grid_desc_n_k,
+                                                 const E1GridDesc_M_N& e1_grid_desc_m_n,
+                                                 const Block2E1TileMap& block_2_e1tile_map)
     {
         static_assert((Gemm0MPerBlock % (Gemm0MPerXdl * Gemm0MXdlPerWave) == 0) &&
                           (Gemm0NPerBlock % (Gemm0NXdlPerWave * Gemm0NPerXdl)) == 0,
                       "Invalid tuning param!");
+        if constexpr((Gemm0MPerXdl * Gemm0MXdlPerWave) == 0 ||
+                     (Gemm0NXdlPerWave * Gemm0NPerXdl) == 0)
+        {
+            return false;
+        }
+        else
+        {
+            if constexpr((Gemm0MPerBlock % (Gemm0MPerXdl * Gemm0MXdlPerWave) != 0) ||
+                         (Gemm0NPerBlock % (Gemm0NXdlPerWave * Gemm0NPerXdl) != 0))
+            {
+                return false;
+            }
+            else
+            {
+                if(WaveSize != get_warp_size())
+                {
+                    return false;
+                }
+            }
+        }
 
         const auto M      = a0_grid_desc_m_k.GetLength(I0);
         const auto N      = b0_grid_desc_n_k.GetLength(I0);
@@ -527,8 +572,7 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                                const CDE1ElementwiseOperation& cde1_element_op,
                                const A0GridDesc_AK0_M_AK1& a0_grid_desc_ak0_m_ak1,
                                const B0GridDesc_BK0_N_BK1& b0_grid_desc_bk0_n_bk1,
-                               const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5&
-                                   d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                               const D0sGridDesc_M_N& d0s_griddesc_m_n,
                                const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1,
                                const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
                                    d1s_grid_desc_mblock_mperblock_nblock_nperblock,
@@ -536,6 +580,8 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                                    e1_grid_desc_mblock_mperblock_nblock_nperblock,
                                const Block2E1TileMap& block_2_e1tile_map)
     {
+        const auto d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 =
+            MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(d0s_griddesc_m_n);
         const auto a0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a0_grid, a0_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -824,16 +870,30 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         constexpr auto A1ThreadSlice_K0_M_K1 = make_tuple(
             Number<Gemm1KPerBlock / n4 / Acc0N3>{}, Number<m0 * m1 * m2>{}, Number<n4>{});
 
-        constexpr auto A1ThreadSliceK0        = A1ThreadSlice_K0_M_K1[I0];
-        constexpr auto A1ThreadSliceM         = A1ThreadSlice_K0_M_K1[I1];
-        constexpr auto A1ThreadSliceK1        = A1ThreadSlice_K0_M_K1[I2];
+        constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0];
+        constexpr auto A1ThreadSliceM  = A1ThreadSlice_K0_M_K1[I1];
+        constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2];
+#if defined(__gfx11__)
+        constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor_packed(
+            make_tuple(A1ThreadSliceK0, A1ThreadSliceM, Number<A1ThreadSliceK1 * 2>{}));
+        auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow<
+            Acc0DataType,
+            A0B0B1DataType,
+            decltype(acc0_thread_desc_k0_m_k1),
+            decltype(a1_thread_desc_k0_m_k1),
+            tensor_operation::element_wise::PassThrough,
+            Sequence<A1ThreadSliceK0, A1ThreadSliceM, A1ThreadSliceK1>,
+            Sequence<1, 0, 2>,
+            2,
+            n4,
+            0x76543210,
+            0xfedcba98,
+            true>{make_tuple(0, 0, 0)};
+#else
         constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor(
             A1ThreadSlice_K0_M_K1,
             make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
 
-        // B1 matrix in LDS memory, dst of blockwise copy
-        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-
         // A1 matrix blockwise copy
         auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
             Acc0DataType,
@@ -845,7 +905,10 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
             Sequence<1, 0, 2>,
             2,
             n4>{tensor_operation::element_wise::PassThrough{}};
+#endif
 
+        // B1 matrix in LDS memory, dst of blockwise copy
+        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
         // B1 matrix blockwise copy
         auto b1_blockwise_copy =
             ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
@@ -893,10 +956,13 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
         // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
         // therefore we may just as well assign Gemm1KPack = group_size
-
+#if defined(__gfx11__)
+        constexpr index_t Gemm1KPack =
+            MfmaSelector<A0B0B1DataType, Gemm0MPerXdl, Gemm0NPerXdl>::selected_mfma.group_size * 2;
+#else
         constexpr index_t Gemm1KPack =
             MfmaSelector<A0B0B1DataType, Gemm0MPerXdl, Gemm0NPerXdl>::selected_mfma.group_size;
-
+#endif
         auto blockwise_gemm1 = BlockwiseGemmXdlops_v2<
             BlockSize,
             A0B0B1DataType,
@@ -987,7 +1053,7 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
             else
             {
                 static_for<0, acc0_thread_buf.Size(), 1>{}(
-                    [&](auto i) { cde0_element_op(acc_thread_buf(i), acc0_thread_buf[i]); });
+                    [&](auto i) { cde0_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); });
             }
             // gemm1
             {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
index 0f2085525f..0e8d003071 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -210,6 +210,22 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -445,11 +461,12 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
                                const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1,
                                const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
                                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5&
-                                   d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                               const D0sGridDesc_M_N& d0s_griddesc_m_n,
                                const Block2CTileMap& block_2_ctile_map,
                                const C0MatrixMask& c0_matrix_mask)
     {
+        const auto d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 =
+            MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(d0s_griddesc_m_n);
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -727,16 +744,30 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         constexpr auto A1ThreadSlice_K0_M_K1 =
             make_tuple(Number<Gemm1KPerBlock / n4 / AccN3>{}, Number<m0 * m1 * m2>{}, Number<n4>{});
 
-        constexpr auto A1ThreadSliceK0        = A1ThreadSlice_K0_M_K1[I0];
-        constexpr auto A1ThreadSliceM         = A1ThreadSlice_K0_M_K1[I1];
-        constexpr auto A1ThreadSliceK1        = A1ThreadSlice_K0_M_K1[I2];
+        constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0];
+        constexpr auto A1ThreadSliceM  = A1ThreadSlice_K0_M_K1[I1];
+        constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2];
+#if defined(__gfx11__)
+        constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor_packed(
+            make_tuple(A1ThreadSliceK0, A1ThreadSliceM, Number<A1ThreadSliceK1 * 2>{}));
+        auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow<
+            FloatGemmAcc,
+            FloatAB,
+            decltype(acc_thread_desc_k0_m_k1),
+            decltype(a1_thread_desc_k0_m_k1),
+            tensor_operation::element_wise::PassThrough,
+            Sequence<A1ThreadSliceK0, A1ThreadSliceM, A1ThreadSliceK1>,
+            Sequence<1, 0, 2>,
+            2,
+            n4,
+            0x76543210,
+            0xfedcba98,
+            false>{make_tuple(0, 0, 0)};
+#else
         constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor(
             A1ThreadSlice_K0_M_K1,
             make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
 
-        // B1 matrix in LDS memory, dst of blockwise copy
-        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-
         // A1 matrix blockwise copy
         auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
             FloatGemmAcc,
@@ -748,7 +779,9 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
             Sequence<1, 0, 2>,
             2,
             n4>{tensor_operation::element_wise::PassThrough{}};
-
+#endif
+        // B1 matrix in LDS memory, dst of blockwise copy
+        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
         // B1 matrix blockwise copy
         auto b1_blockwise_copy =
             ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
@@ -797,9 +830,13 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
         // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
         // therefore we may just as well assign Gemm1KPack = group_size
-
+#if defined(__gfx11__)
+        constexpr index_t Gemm1KPack =
+            MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size * 2;
+#else
         constexpr index_t Gemm1KPack =
             MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size;
+#endif
 
         auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2<
             BlockSize,
@@ -873,8 +910,8 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
                                                   decltype(thread_slice_desc_m_n)>{};
 
         const index_t num_gemm1_k_block_outer_loop =
-            b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock;
-        constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock;
+            b_grid_desc_bk0_n_bk1.GetLength(I1) / (NPerBlock / Gemm0NWaves);
+        constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm0NWaves / Gemm1KPerBlock;
 
         // Initialize C
         StaticBuffer<AddressSpaceEnum::Vgpr, FloatGemmAcc, acc1_thread_buf.Size(), true>
@@ -1046,6 +1083,7 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
                 // main body
                 if constexpr(num_gemm1_k_block_inner_loop > 1)
                 {
+
                     static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) {
                         a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1,
                                               make_tuple(Number<i * A1ThreadSliceK0>{}, I0, I0),
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
index 1754e07e6a..502c449ef1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -569,26 +570,33 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
 
         if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1)))
         {
-            printf("GridwiseOp: M/N Length err, A_M/N = %d, %d | C_M/N = %d, %d\n",
-                   M,
-                   N,
-                   c_grid_desc_m_n.GetLength(I0),
-                   c_grid_desc_m_n.GetLength(I1));
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: M/N Length err, A_M/N = %d, %d | C_M/N = %d, %d\n",
+                       M,
+                       N,
+                       c_grid_desc_m_n.GetLength(I0),
+                       c_grid_desc_m_n.GetLength(I1));
+            }
             return false;
         }
 
         if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && N % NPerBlock == 0))
         {
-            printf("GridwiseOp: M/L/K/N Division err, M/L/K/N = %d, %d, %d, %d | M/L/K/NPerBlock = "
-                   "%d, %d, %d, %d\n",
-                   M,
-                   L,
-                   K,
-                   N,
-                   MPerBlock,
-                   LPerBlock,
-                   KPerBlock,
-                   NPerBlock);
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: M/L/K/N Division err, M/L/K/N = %d, %d, %d, %d | "
+                       "M/L/K/NPerBlock = "
+                       "%d, %d, %d, %d\n",
+                       M,
+                       L,
+                       K,
+                       N,
+                       MPerBlock,
+                       LPerBlock,
+                       KPerBlock,
+                       NPerBlock);
+            }
             return false;
         }
 
@@ -596,23 +604,32 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
         const auto num_gemm0_k_loop = K / KPerBlock;
         if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop))
         {
-            printf("GridwiseOp: outer loop unsupport\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: outer loop unsupport\n");
+            }
             return false;
         }
 
         // check gemm1 gridwise gemm pipeline
         if(!(LPerBlock % LTilePerBlock == 0))
         {
-            printf("GridwiseOp: inner loop division, L/LTilePerblock: %d, %d\n",
-                   LPerBlock,
-                   LTilePerBlock);
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: inner loop division, L/LTilePerblock: %d, %d\n",
+                       LPerBlock,
+                       LTilePerBlock);
+            }
             return false;
         }
 
         const auto num_gemm1_k_inner_loop = LPerBlock / LTilePerBlock;
         if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop))
         {
-            printf("GridwiseOp: inner loop unsupport\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: inner loop unsupport\n");
+            }
             return false;
         }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index 33b9199ea5..e0cf12e429 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -209,6 +209,22 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
         return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -570,17 +586,32 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
         constexpr auto A1ThreadSlice_K0_M_K1 =
             make_tuple(Number<Gemm1KPerBlock / n4 / AccN3>{}, Number<m0 * m1 * m2>{}, Number<n4>{});
 
-        constexpr auto A1ThreadSliceK0        = A1ThreadSlice_K0_M_K1[I0];
-        constexpr auto A1ThreadSliceM         = A1ThreadSlice_K0_M_K1[I1];
-        constexpr auto A1ThreadSliceK1        = A1ThreadSlice_K0_M_K1[I2];
+        constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0];
+        constexpr auto A1ThreadSliceM  = A1ThreadSlice_K0_M_K1[I1];
+        constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2];
+
+        // A1 matrix blockwise copy
+#if defined(__gfx11__)
+        constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor_packed(
+            make_tuple(A1ThreadSliceK0, A1ThreadSliceM, Number<A1ThreadSliceK1 * 2>{}));
+        auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow<
+            FloatGemmAcc,
+            FloatAB,
+            decltype(acc_thread_desc_k0_m_k1),
+            decltype(a1_thread_desc_k0_m_k1),
+            tensor_operation::element_wise::PassThrough,
+            Sequence<A1ThreadSliceK0, A1ThreadSliceM, A1ThreadSliceK1>,
+            Sequence<1, 0, 2>,
+            2,
+            n4,
+            0x76543210,
+            0xfedcba98,
+            false>{make_tuple(0, 0, 0)};
+        static_assert(n4 == A1ThreadSliceK1);
+#else
         constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor(
             A1ThreadSlice_K0_M_K1,
             make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
-
-        // B1 matrix in LDS memory, dst of blockwise copy
-        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-
-        // A1 matrix blockwise copy
         auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
             FloatGemmAcc,
             FloatAB,
@@ -591,6 +622,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
             Sequence<1, 0, 2>,
             2,
             n4>{tensor_operation::element_wise::PassThrough{}};
+#endif
+
+        // B1 matrix in LDS memory, dst of blockwise copy
+        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
 
         // B1 matrix blockwise copy
         auto b1_blockwise_copy =
@@ -640,9 +675,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
         // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
         // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
         // therefore we may just as well assign Gemm1KPack = group_size
-
+#if defined(__gfx11__)
+        constexpr index_t Gemm1KPack =
+            MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size * 2;
+#else
         constexpr index_t Gemm1KPack =
             MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size;
+#endif
 
         auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2<
             BlockSize,
@@ -716,8 +755,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
                                                   decltype(thread_slice_desc_m_n)>{};
 
         const index_t num_gemm1_k_block_outer_loop =
-            b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock;
-        constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock;
+            b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock * Gemm0NWaves;
+        constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock / Gemm0NWaves;
 
         // Initialize C
         StaticBuffer<AddressSpaceEnum::Vgpr, FloatGemmAcc, acc1_thread_buf.Size(), true>
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
index 8011fa56d3..c8b154228f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -466,20 +467,26 @@ struct GridwiseFpAintBGemm_Wmma
         if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) &&
              K == GetBProblemsizeNK()[I1]))
         {
-            printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n",
-                   GetAProblemsizeMK()[I0],
-                   GetAProblemsizeMK()[I1],
-                   GetBProblemsizeNK()[I0],
-                   GetBProblemsizeNK()[I1],
-                   c_grid_desc_m_n.GetLength(I0),
-                   c_grid_desc_m_n.GetLength(I1));
-            printf("GridwiseOp err: ProblemSize check");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n",
+                       GetAProblemsizeMK()[I0],
+                       GetAProblemsizeMK()[I1],
+                       GetBProblemsizeNK()[I0],
+                       GetBProblemsizeNK()[I1],
+                       c_grid_desc_m_n.GetLength(I0),
+                       c_grid_desc_m_n.GetLength(I1));
+                printf("GridwiseOp err: ProblemSize check");
+            }
             return false;
         }
 
         if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
         {
-            printf("GridwiseOp err: ProblemSize division");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp err: ProblemSize division");
+            }
             return false;
         }
 
@@ -488,7 +495,10 @@ struct GridwiseFpAintBGemm_Wmma
 
         if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
-            printf("GridwiseOp err: Pipeline not support this k_loop");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp err: Pipeline not support this k_loop");
+            }
             return false;
         }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index 96b737385a..0a4691b509 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -66,29 +66,34 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_bias_grid,
-                                                  p_d0_grid,
-                                                  p_reduces_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c1_element_op,
-                                                  reduce_in_element_ops,
-                                                  reduce_out_element_ops,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  c0_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  c1_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  reduce_grid_desc_mblock_mperblock,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            p_bias_grid,
+            p_d0_grid,
+            p_reduces_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            c1_element_op,
+            reduce_in_element_ops,
+            reduce_out_element_ops,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+            c0_grid_desc_mblock_mperblock_nblock_nperblock,
+            c1_grid_desc_mblock_mperblock_nblock_nperblock,
+            reduce_grid_desc_mblock_mperblock,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -252,6 +257,22 @@ struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
index c37ffb6263..ad28a12e57 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -107,8 +107,11 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
     using BComputeDataType =
         conditional_t<is_same_v<BComputeDataType_, ck::half_t>, ck::bhalf_t, BComputeDataType_>;
 #else
-    using AComputeDataType = AComputeDataType_;
-    using BComputeDataType = BComputeDataType_;
+    // Element data type is used in LDS and registers. ComputeDataType_ is inside mfma, eg tf32.
+    using AElementDataType =
+        conditional_t<is_same_v<AComputeDataType_, ck::tf32_t>, float, AComputeDataType_>;
+    using BElementDataType =
+        conditional_t<is_same_v<BComputeDataType_, ck::tf32_t>, float, BComputeDataType_>;
 #endif
 
     __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
@@ -199,8 +202,8 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         constexpr auto c_block_size =
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
 
-        return math::max(a_block_space_size_aligned * sizeof(AComputeDataType) +
-                             b_block_space_size_aligned * sizeof(BComputeDataType),
+        return math::max(a_block_space_size_aligned * sizeof(AElementDataType) +
+                             b_block_space_size_aligned * sizeof(BElementDataType),
                          c_block_size * sizeof(CShuffleDataType));
     }
 
@@ -298,6 +301,8 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
             e_grid_desc_m_n);
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename AsGridDesc_M_K,
               typename BsGridDesc_N_K,
@@ -313,6 +318,7 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
+
         static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
                       "KPerBlock must be divisible by AK1Value and BK1Value!");
 
@@ -618,7 +624,7 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v7r2<
             ThisThreadBlock,
             AsDataType,
-            Tuple<AComputeDataType>,
+            Tuple<AElementDataType>,
             decltype(as_grid_desc_ak0_m_ak1),
             decltype(tie(a_block_desc_ak0_m_ak1)),
             AElementwiseOperation,
@@ -646,7 +652,7 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v7r2<
             ThisThreadBlock,
             BsDataType,
-            Tuple<BComputeDataType>,
+            Tuple<BElementDataType>,
             decltype(bs_grid_desc_bk0_n_bk1),
             decltype(tie(b_block_desc_bk0_n_bk1)),
             BElementwiseOperation,
@@ -676,27 +682,28 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            (((is_same<AComputeDataType, half_t>::value ||
-               is_same<AComputeDataType, bhalf_t>::value) &&
+            (((is_same<AComputeDataType_, half_t>::value ||
+               is_same<AComputeDataType_, bhalf_t>::value) &&
               lcm_AK1_BK1 <= 4) ||
-             (is_same<AComputeDataType, int8_t>::value && lcm_AK1_BK1 <= 8) ||
-             ((is_same<AComputeDataType, f8_t>::value || is_same<AComputeDataType, bf8_t>::value) &&
+             (is_same<AComputeDataType_, int8_t>::value && lcm_AK1_BK1 <= 8) ||
+             ((is_same<AComputeDataType_, f8_t>::value ||
+               is_same<AComputeDataType_, bf8_t>::value) &&
               lcm_AK1_BK1 < 32))
                 ? true
                 : false;
         static constexpr auto is_scale_mfma = false;
         constexpr index_t KPack             = math::max(lcm_AK1_BK1,
-                                            MfmaSelector<AComputeDataType,
+                                            MfmaSelector<AComputeDataType_,
                                                                      MPerXdl,
                                                                      NPerXdl,
-                                                                     BComputeDataType,
+                                                                     BComputeDataType_,
                                                                      is_single_rate_mfma,
                                                                      is_scale_mfma>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
-            AComputeDataType,
-            BComputeDataType,
+            AElementDataType,
+            BElementDataType,
             AccDataType,
             decltype(a_block_desc_ak0_m_ak1),
             decltype(b_block_desc_bk0_n_bk1),
@@ -705,7 +712,9 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
             MXdlPerWave,
             NXdlPerWave,
             KPack,
-            LoopSched>();
+            LoopSched,
+            AComputeDataType_,
+            BComputeDataType_>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -714,10 +723,10 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
 
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<AComputeDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<AElementDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BComputeDataType*>(p_shared) + a_block_space_size_aligned,
+            static_cast<BElementDataType*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index df5c8b10f3..5411137bf4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -201,6 +201,8 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(FloatE)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2ETileMap>
     __host__ __device__ static constexpr bool CheckValidity(const AGridDesc_M_K& a_grid_desc_m_k,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index 46979a5620..7d68d64ed8 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -653,13 +654,19 @@ struct GridwiseGemmMultipleD_Wmma
         if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) &&
              K == GetBProblemsizeNK()[I1]))
         {
-            printf("GridwiseOp: ABE descriptor dimension cross check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: ABE descriptor dimension cross check failure\n");
+            }
             return false;
         }
 
         if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
         {
-            printf("GridwiseOp: Problemsize descriptor dimension check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: Problemsize descriptor dimension check failure\n");
+            }
             return false;
         }
 
@@ -747,20 +754,29 @@ struct GridwiseGemmMultipleD_Wmma
 
         if(!valid)
         {
-            printf("GridwiseOp: D descriptor dimension check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: D descriptor dimension check failure\n");
+            }
             return false;
         }
 
         if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) &&
              K == GetBProblemsizeNK()[I1]))
         {
-            printf("GridwiseOp: ABE descriptor dimension cross check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: ABE descriptor dimension cross check failure\n");
+            }
             return false;
         }
 
         if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
         {
-            printf("GridwiseOp: Problemsize descriptor dimension check failure\n");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp: Problemsize descriptor dimension check failure\n");
+            }
             return false;
         }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index 318ff59383..1d9b7eb978 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -107,8 +107,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
     using BComputeDataType =
         conditional_t<is_same_v<BComputeDataType_, ck::half_t>, ck::bhalf_t, BComputeDataType_>;
 #else
-    using AComputeDataType = AComputeDataType_;
-    using BComputeDataType = BComputeDataType_;
+    using AComputeDataType =
+        conditional_t<is_same_v<AComputeDataType_, ck::tf32_t>, float, AComputeDataType_>;
+    using BComputeDataType =
+        conditional_t<is_same_v<BComputeDataType_, ck::tf32_t>, float, BComputeDataType_>;
 #endif
 
     __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
@@ -322,6 +324,8 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         return true;
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     template <typename AGridDesc_M_K,
               typename BGridDesc_N_K,
               typename DsGridDesc_M_N,
@@ -337,6 +341,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
+
         static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
                       "KPerBlock must be divisible by AK1Value and BK1Value!");
 
@@ -556,9 +561,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
             return;
         }
 
-        const index_t num_k_per_block =
+        const index_t num_ak0_per_block =
             __builtin_amdgcn_readfirstlane(a_grid_desc_ak0_m_ak1.GetLength(I0) / k_batch);
-
+        const index_t num_bk0_per_block =
+            __builtin_amdgcn_readfirstlane(b_grid_desc_bk0_n_bk1.GetLength(I0) / k_batch);
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
             __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
@@ -600,7 +606,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                 true,
                                                 NumGemmKPrefetchStage>(
                 a_grid_desc_ak0_m_ak1,
-                make_multi_index(num_k_per_block * k_idx, m_block_data_idx_on_grid, 0),
+                make_multi_index(num_ak0_per_block * k_idx, m_block_data_idx_on_grid, 0),
                 a_element_op,
                 a_block_desc_ak0_m_ak1,
                 make_multi_index(0, 0, 0),
@@ -631,7 +637,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                 true,
                                                 NumGemmKPrefetchStage>(
                 b_grid_desc_bk0_n_bk1,
-                make_multi_index(num_k_per_block * k_idx, n_block_data_idx_on_grid, 0),
+                make_multi_index(num_bk0_per_block * k_idx, n_block_data_idx_on_grid, 0),
                 b_element_op,
                 b_block_desc_bk0_n_bk1,
                 make_multi_index(0, 0, 0),
@@ -656,26 +662,27 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                 : false;
         constexpr auto is_scale_mfma = false;
         constexpr index_t KPack      = math::max(lcm_AK1_BK1,
-                                            MfmaSelector<AComputeDataType,
+                                            MfmaSelector<AComputeDataType_,
                                                               MPerXdl,
                                                               NPerXdl,
-                                                              BComputeDataType,
+                                                              BComputeDataType_,
                                                               is_single_rate_mfma,
                                                               is_scale_mfma>::selected_mfma.k_per_blk);
-
-        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
-            BlockSize,
-            AComputeDataType,
-            BComputeDataType,
-            AccDataType,
-            decltype(a_block_desc_ak0_m_ak1),
-            decltype(b_block_desc_bk0_n_bk1),
-            MPerXdl,
-            NPerXdl,
-            MXdlPerWave,
-            NXdlPerWave,
-            KPack,
-            LoopSched>();
+        auto blockwise_gemm          = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
+                     BlockSize,
+                     AComputeDataType,
+                     BComputeDataType,
+                     AccDataType,
+                     decltype(a_block_desc_ak0_m_ak1),
+                     decltype(b_block_desc_bk0_n_bk1),
+                     MPerXdl,
+                     NPerXdl,
+                     MXdlPerWave,
+                     NXdlPerWave,
+                     KPack,
+                     LoopSched,
+                     AComputeDataType_,
+                     BComputeDataType_>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
index bd9b08f8f9..1e72e78349 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -57,21 +57,25 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const Block2ETileMap block_2_etile_map)
 {
 #if(defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_ds_grid,
-                                                  p_e_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_etile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_ds_grid,
+            p_e_grid,
+            p_shared,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            e_grid_desc_mblock_mperblock_nblock_nperblock,
+            block_2_etile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -140,7 +144,7 @@ template <typename ALayout,
           index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched,
           PipelineVersion PipelineVer = PipelineVersion::v4,
-          typename BComputeDataType   = AComputeDataType_>
+          typename BComputeDataType_  = AComputeDataType_>
 struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
@@ -168,7 +172,10 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
     using AComputeDataType =
         conditional_t<is_same_v<AComputeDataType_, ck::half_t>, ck::bhalf_t, AComputeDataType_>;
 #else
-    using AComputeDataType = AComputeDataType_;
+    using AComputeDataType =
+        conditional_t<is_same_v<AComputeDataType_, ck::tf32_t>, float, AComputeDataType_>;
+    using BComputeDataType =
+        conditional_t<is_same_v<BComputeDataType_, ck::tf32_t>, float, BComputeDataType_>;
 #endif
 
     __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
@@ -424,6 +431,8 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
 
     using Block2ETileMap = remove_cvref_t<decltype(MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     __host__ __device__ static constexpr bool CheckValidity(const AGridDesc_M_K& a_grid_desc_m_k,
                                                             const BGridDesc_N_K& b_grid_desc_n_k,
                                                             const DsGridDesc_M_N& ds_grid_desc_m_n,
@@ -433,6 +442,7 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
+
         static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
                       "KPerBlock must be divisible by AK1Value and BK1Value!");
 
@@ -566,7 +576,6 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         // This forces m/n_block_data_idx_on_grid into SGPR.
         const index_t m_block_data_idx_on_grid =
             __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
-
         const index_t n_block_data_idx_on_grid =
             __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
 
@@ -633,10 +642,10 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         constexpr auto is_scale_mfma = false;
 
         constexpr index_t KPack = math::max(lcm_AK1_BK1,
-                                            MfmaSelector<AComputeDataType,
+                                            MfmaSelector<AComputeDataType_,
                                                          MPerXdl,
                                                          NPerXdl,
-                                                         BComputeDataType,
+                                                         BComputeDataType_,
                                                          is_single_rate_mfma,
                                                          is_scale_mfma>::selected_mfma.k_per_blk);
 
@@ -652,7 +661,9 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
             MXdlPerWave,
             NXdlPerWave,
             KPack,
-            LoopSched>();
+            LoopSched,
+            AComputeDataType_,
+            BComputeDataType_>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
index 85b5b5faab..872e1271e1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -351,6 +351,8 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
             e_grid_desc_m_n);
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     template <typename ALayout,
               typename BLayout,
               typename DsLayout,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index 010b2144b9..d3479fe0b4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -54,24 +54,28 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_reduces_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  reduce_in_element_ops,
-                                                  reduce_out_element_ops,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  reduce_grid_desc_mblock_mperblock,
-                                                  block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_c_grid,
+                                                      p_reduces_grid,
+                                                      p_shared,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      reduce_in_element_ops,
+                                                      reduce_out_element_ops,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      reduce_grid_desc_mblock_mperblock,
+                                                      block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -225,6 +229,22 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
index b4848c7077..8f7aac0171 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -283,6 +283,8 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
             e_grid_desc_m_n, 8, split_k);
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename AGridDesc_AKB_AK0_M_AK1,
               typename BGridDesc_BKB_BK0_N_BK1,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
index 1b4c2666ab..9d34f9e2a4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -348,6 +348,8 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
             e_grid_desc_m_n);
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(EDataType)
+
     template <typename ALayout,
               typename BLayout,
               typename DsLayout,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
index 4a15958adb..65f74de3cf 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -458,20 +459,26 @@ struct GridwiseGemm_Wmma
         if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) &&
              K == GetBProblemsizeNK()[I1]))
         {
-            printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n",
-                   GetAProblemsizeMK()[I0],
-                   GetAProblemsizeMK()[I1],
-                   GetBProblemsizeNK()[I0],
-                   GetBProblemsizeNK()[I1],
-                   c_grid_desc_m_n.GetLength(I0),
-                   c_grid_desc_m_n.GetLength(I1));
-            printf("GridwiseOp err: ProblemSize check");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n",
+                       GetAProblemsizeMK()[I0],
+                       GetAProblemsizeMK()[I1],
+                       GetBProblemsizeNK()[I0],
+                       GetBProblemsizeNK()[I1],
+                       c_grid_desc_m_n.GetLength(I0),
+                       c_grid_desc_m_n.GetLength(I1));
+                printf("GridwiseOp err: ProblemSize check");
+            }
             return false;
         }
 
         if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
         {
-            printf("GridwiseOp err: ProblemSize division");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp err: ProblemSize division");
+            }
             return false;
         }
 
@@ -480,7 +487,10 @@ struct GridwiseGemm_Wmma
 
         if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
-            printf("GridwiseOp err: Pipeline not support this k_loop");
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("GridwiseOp err: Pipeline not support this k_loop");
+            }
             return false;
         }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp
index 9a8d09e5e4..d226510cf0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp
@@ -11,7 +11,8 @@
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
-#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp"
@@ -22,9 +23,10 @@ namespace ck {
 ///
 /// @par Overview
 ///         This GEMM kernel is carrying out following mathematical equation:
-///         C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N}))
-///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
-///         elementwise operations that could be applied on each tensor respectively.
+///         E{M,N} = CDE_op(A_op(A{M,K}) * B_op(B{K,N}), Ds{M,N}...)
+///         Where A, B, Ds are input tensors and E is the output tensor. The A/B are elementwise
+//          operations that could be applied on each tensor respectively. The CDE_op is an
+//          elementwise operation applied to the C and all D tensors.
 ///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
 ///         scenarios. That's why it's called \"universal\". It's universal through it's design
 ///         and versatilty.
@@ -36,18 +38,20 @@ namespace ck {
 ///
 /// @tparam ALayout     A tensor data layout.
 /// @tparam BLayout     B tensor data layout.
-/// @tparam CLayout     C tensor data layout.
-/// @tparam ADataType   A tensor data type.
-/// @tparam BDataType   B tensor data type.
+/// @tparam DsLayout    D tensors data layouts.
+/// @tparam ELayout     E tensor data layout.
+/// @tparam AsDataType  A tensors data types.
+/// @tparam BsDataType  B tensors data types.
 /// @tparam AccDataType The accumulation data type related to the hardware
 ///                         matrix-multiplication instruction.
 /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
 ///                          LDS memory during \"CShuffle\" data layout optimization.
-/// @tparam CDataType   C tensor data type.
-/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
-/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
-/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
-///                               (after GEMM).
+/// @tparam DsDataType  D tensors data types.
+/// @tparam EDataType   E tensor data type.
+/// @tparam AElementwiseOperation   Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation   Elementwise operation applied to the B input tensor elements.
+/// @tparam CDEElementwiseOperation Elementwise operation applied to the C output tensor (after
+///                                 GEMM) and D input tensors.
 /// @tparam GemmSpec    Determines used "padding" version.
 /// @tparam BlockSize   The number of threads within workgroup.
 /// @tparam MPerBlock   The input/output data tile size in the M dimension.
@@ -105,11 +109,12 @@ namespace ck {
 /// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
 ///                                         results to process per wave per iteration of CShuffle
 ///                                         in N dimension.
-/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+/// @tparam CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
 ///                                         thread distribution used for storing data into output
 ///                                         tensor across output data layout dimensions.
-/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
-///                                         Used when storing data to output tensor.
+/// @tparam CDEShuffleBlockTransferScalarPerVectors The size of vectorized memory access.
+///                                         Used when loading data from D tensors and storing data
+///                                         to output tensor.
 /// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
 ///                             intrawave).
 /// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
@@ -123,15 +128,17 @@ namespace ck {
 ///                             in global memory (pre-shuffled).
 template <typename ALayout,
           typename BLayout,
-          typename CLayout,
-          typename ADataType,
-          typename BDataType,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
           typename AccDataType,
           typename CShuffleDataType,
-          typename CDataType,
+          typename DsDataType,
+          typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation,
+          typename CDEElementwiseOperation,
           tensor_operation::device::GemmSpecialization GemmSpec,
           index_t BlockSize,
           index_t MPerBlock,
@@ -161,8 +168,8 @@ template <typename ALayout,
           index_t BBlockLdsExtraN,
           index_t CShuffleMRepeatPerShuffle,
           index_t CShuffleNRepeatPerShuffle,
-          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched,
           BlockGemmPipelineVersion BlkGemmPipelineVer,
           typename ComputeTypeA,
@@ -173,15 +180,17 @@ struct GridwiseGemm_wmma_cshuffle_v3
     : GridwiseGemm_wmma_cshuffle_v3_base<
           ALayout,
           BLayout,
-          CLayout,
-          ADataType,
-          BDataType,
+          DsLayout,
+          ELayout,
+          AsDataType,
+          BsDataType,
           AccDataType,
           CShuffleDataType,
-          CDataType,
+          DsDataType,
+          EDataType,
           AElementwiseOperation,
           BElementwiseOperation,
-          CElementwiseOperation,
+          CDEElementwiseOperation,
           GemmSpec,
           BlockSize,
           MPerBlock,
@@ -211,8 +220,8 @@ struct GridwiseGemm_wmma_cshuffle_v3
           BBlockLdsExtraN,
           CShuffleMRepeatPerShuffle,
           CShuffleNRepeatPerShuffle,
-          CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          CShuffleBlockTransferScalarPerVector_NPerBlock,
+          CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          CDEShuffleBlockTransferScalarPerVectors,
           BlkGemmPipeSched,
           BlkGemmPipelineVer,
           ComputeTypeA,
@@ -223,15 +232,17 @@ struct GridwiseGemm_wmma_cshuffle_v3
     using Base = GridwiseGemm_wmma_cshuffle_v3_base<
         ALayout,
         BLayout,
-        CLayout,
-        ADataType,
-        BDataType,
+        DsLayout,
+        ELayout,
+        AsDataType,
+        BsDataType,
         AccDataType,
         CShuffleDataType,
-        CDataType,
+        DsDataType,
+        EDataType,
         AElementwiseOperation,
         BElementwiseOperation,
-        CElementwiseOperation,
+        CDEElementwiseOperation,
         GemmSpec,
         BlockSize,
         MPerBlock,
@@ -261,8 +272,8 @@ struct GridwiseGemm_wmma_cshuffle_v3
         BBlockLdsExtraN,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
-        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
@@ -295,34 +306,47 @@ struct GridwiseGemm_wmma_cshuffle_v3
     using Base::CalculateMPadded;
     using Base::CalculateNBlock;
     using Base::CalculateNPadded;
-    using Base::MakeAGridDescriptor_AK0_M_AK1;
-    using Base::MakeBGridDescriptor_BK0_N_BK1;
-    using Base::MakeCGridDescriptor_M_N;
+    using Base::MakeAsGridDescriptor_AK0_M_AK1;
+    using Base::MakeBsGridDescriptor_BK0_N_BK1;
+    using Base::MakeDEGridDescriptor_M_N;
+    using Base::MakeDsGridDescriptor_M_N;
+    using Base::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
 
     using Base::GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat;
 
-    using Base::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
+    using Base::MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
     using Base::GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1;
     using Base::GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1;
 
+    using Base::NumATensor;
+    using Base::NumBTensor;
+    using Base::NumDTensor;
+    using typename Base::AsGridPointer;
+    using typename Base::BsGridPointer;
+    using typename Base::DsGridPointer;
+    using AsDataType_ = AsDataType;
+    using BsDataType_ = BsDataType;
+
     struct Problem
     {
         __host__ Problem(index_t M_,
                          index_t N_,
                          index_t K_,
-                         index_t StrideA_,
-                         index_t StrideB_,
-                         index_t StrideC_,
+                         std::array<index_t, NumATensor> StrideAs_,
+                         std::array<index_t, NumBTensor> StrideBs_,
+                         std::array<index_t, NumDTensor> StrideDs_,
+                         index_t StrideE_,
                          index_t KBatch_)
             : M{M_},
               N{N_},
               K{K_},
-              StrideA{StrideA_},
-              StrideB{StrideB_},
-              StrideC{StrideC_},
+              StrideAs{StrideAs_},
+              StrideBs{StrideBs_},
+              StrideDs{StrideDs_},
+              StrideE{StrideE_},
               KBatch{KBatch_},
               MPadded{CalculateMPadded(M_)},
               NPadded{CalculateNPadded(N_)},
@@ -338,19 +362,36 @@ struct GridwiseGemm_wmma_cshuffle_v3
         __host__ void Print() const
         {
             std::cout << "problem {" << "M:" << M << ", " << "N:" << N << ", " << "K:" << K << ", "
-                      << "SA:" << StrideA << ", " << "SB:" << StrideB << ", " << "SC:" << StrideC
-                      << ", " << "MP:" << MPadded << ", " << "NP:" << NPadded << ", "
-                      << "KRead:" << KRead << ", " << "KP:" << KPadded << ", " << "AK0:" << AK0
-                      << ", " << "BK0:" << BK0 << ", " << "MBlock: " << MBlock << ", "
-                      << "NBlock: " << NBlock << "}" << std::endl;
+                      << "SAs: {";
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                std::cout << StrideAs[i] << (i.value < NumATensor - 1 ? ", " : "");
+            });
+            std::cout << "}, " << "SBs: {";
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                std::cout << StrideBs[i] << (i.value < NumBTensor - 1 ? ", " : "");
+            });
+            std::cout << "}, ";
+            if constexpr(NumDTensor > 0)
+            {
+                std::cout << "SDs: { ";
+                static_for<0, NumDTensor, 1>{}([&](auto i) {
+                    std::cout << StrideDs[i] << (i.value < NumDTensor - 1 ? ", " : "");
+                });
+                std::cout << " }, ";
+            }
+            std::cout << "SE:" << StrideE << ", " << "MP:" << MPadded << ", " << "NP:" << NPadded
+                      << ", " << "KRead:" << KRead << ", " << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", " << "BK0:" << BK0 << ", " << "MBlock: " << MBlock
+                      << ", " << "NBlock: " << NBlock << "}" << std::endl;
         }
 
         index_t M;
         index_t N;
         index_t K;
-        index_t StrideA;
-        index_t StrideB;
-        index_t StrideC;
+        std::array<index_t, NumATensor> StrideAs;
+        std::array<index_t, NumBTensor> StrideBs;
+        std::array<index_t, NumDTensor> StrideDs;
+        index_t StrideE;
         index_t KBatch;
         index_t MPadded;
         index_t NPadded;
@@ -365,23 +406,55 @@ struct GridwiseGemm_wmma_cshuffle_v3
     // Argument
     struct Argument : public tensor_operation::device::BaseArgument, public Problem
     {
-        __host__ Argument(const ADataType* p_a_grid_,
-                          const BDataType* p_b_grid_,
-                          CDataType* p_c_grid_,
+        __host__ Argument(std::array<const void*, NumATensor> p_as_grid_,
+                          std::array<const void*, NumBTensor> p_bs_grid_,
+                          std::array<const void*, NumDTensor> p_ds_grid_,
+                          EDataType* p_e_grid_,
                           index_t M_,
                           index_t N_,
                           index_t K_,
-                          index_t StrideA_,
-                          index_t StrideB_,
-                          index_t StrideC_,
+                          std::array<index_t, NumATensor> StrideAs_,
+                          std::array<index_t, NumBTensor> StrideBs_,
+                          std::array<index_t, NumDTensor> StrideDs_,
+                          index_t StrideE_,
                           index_t k_batch_,
+                          AElementwiseOperation a_element_op_,
+                          BElementwiseOperation b_element_op_,
+                          CDEElementwiseOperation cde_element_op_,
                           bool is_reduce_ = false)
-            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, k_batch_},
-              p_a_grid{p_a_grid_},
-              p_b_grid{p_b_grid_},
-              p_c_grid{p_c_grid_},
+            : Problem{M_, N_, K_, StrideAs_, StrideBs_, StrideDs_, StrideE_, k_batch_},
+              p_as_grid{},
+              p_bs_grid{},
+              p_ds_grid{},
+              p_e_grid{p_e_grid_},
+              a_element_op{a_element_op_},
+              b_element_op{b_element_op_},
+              cde_element_op{cde_element_op_},
               is_reduce(is_reduce_)
         {
+            // populate pointer, desc for As
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ADataType_ = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                // A pointer
+                p_as_grid(i) = static_cast<const ADataType_*>(p_as_grid_[i]);
+            });
+
+            // populate pointer, desc for Bs
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BDataType_ = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                // B pointer
+                p_bs_grid(i) = static_cast<const BDataType_*>(p_bs_grid_[i]);
+            });
+
+            // populate pointer, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                // D pointer
+                p_ds_grid(i) = static_cast<const DDataType*>(p_ds_grid_[i]);
+            });
         }
 
         __host__ __device__ inline bool IsReduceAdd() const
@@ -394,44 +467,61 @@ struct GridwiseGemm_wmma_cshuffle_v3
             return (Problem::KBatch > 1) && (!is_reduce);
         }
 
-        const ADataType* p_a_grid;
-        const BDataType* p_b_grid;
-        CDataType* p_c_grid;
+        AsGridPointer p_as_grid;
+        BsGridPointer p_bs_grid;
+        DsGridPointer p_ds_grid;
+        EDataType* p_e_grid;
+
+        const AElementwiseOperation a_element_op;
+        const BElementwiseOperation b_element_op;
+        const CDEElementwiseOperation cde_element_op;
+
+        // TODO: it can be used with SplitK+reduction but currently only used with SplitK+atomicAdd
         bool is_reduce;
     };
 
     struct SplitKBatchOffset
     {
 
-        __device__ SplitKBatchOffset(Argument& karg)
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
         {
+            // Note: in xdl implementation multiple AB supports one layout
+            // but multiple strides, so we create an array of offsets with
+            // the same values.
+            // It should be fixed later on. Once we will have a thread transfer
+            // more flexible.
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
+                static_for<0, NumATensor, 1>{}(
+                    [&](auto i) { a_k_split_offset[i] = k_id * karg.KRead / APackedSize; });
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+                static_for<0, NumATensor, 1>{}(
+                    [&](auto i) { a_k_split_offset[i] = k_id * karg.KRead * karg.StrideAs[i]; });
             }
 
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
             {
-                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+                static_for<0, NumBTensor, 1>{}(
+                    [&](auto i) { b_k_split_offset[i] = k_id * karg.KRead * karg.StrideBs[i]; });
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
                 if constexpr(!PermuteB)
                 {
-                    b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+                    static_for<0, NumBTensor, 1>{}(
+                        [&](auto i) { b_k_split_offset[i] = k_id * karg.KRead / BPackedSize; });
                 }
                 else
                 {
                     const int k0_offset = karg.KRead * karg.N;
-                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                    static_for<0, NumBTensor, 1>{}(
+                        [&](auto i) { b_k_split_offset[i] = k_id * k0_offset / BPackedSize; });
                 }
             }
 
-            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            if(k_id < karg.KBatch - 1)
             {
                 karg.K = karg.KRead;
             }
@@ -442,7 +532,7 @@ struct GridwiseGemm_wmma_cshuffle_v3
 
             if(karg.IsReduceAdd())
             {
-                c_reduce_offset = blockIdx.z * karg.M * karg.N;
+                c_reduce_offset = k_id * karg.M * karg.N;
             }
             else
             {
@@ -450,8 +540,8 @@ struct GridwiseGemm_wmma_cshuffle_v3
             }
         }
 
-        index_t a_k_split_offset;
-        index_t b_k_split_offset;
+        std::array<index_t, NumATensor> a_k_split_offset;
+        std::array<index_t, NumBTensor> b_k_split_offset;
         index_t c_reduce_offset;
     };
 
@@ -465,23 +555,32 @@ struct GridwiseGemm_wmma_cshuffle_v3
     __device__ static index_t GetKBlockPerScale() { return 1; }
 
     template <bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              TailNumber TailNum = TailNumber::Odd>
-    __device__ static void Run(const ADataType* p_a_grid,
-                               const BDataType* p_b_grid,
-                               CDataType* p_c_grid,
+              InMemoryDataOperationEnum EGlobalMemoryDataOperation,
+              TailNumber TailNum>
+    __device__ static void Run(AsGridPointer& p_as_grid,
+                               BsGridPointer& p_bs_grid,
+                               DsGridPointer& p_ds_grid,
+                               EDataType* p_e_grid,
                                void* p_shared,
-                               const Problem& problem)
+                               const Problem& problem,
+                               AElementwiseOperation a_element_op,
+                               BElementwiseOperation b_element_op,
+                               CDEElementwiseOperation cde_element_op)
     {
-        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
-        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
-            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
-        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
-            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
-        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0);
+        const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideBs, problem.BK0);
+        const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
+        const auto e_grid_desc_m_n = Base::template MakeDEGridDescriptor_M_N<ELayout>(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideE);
+        const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                e_grid_desc_m_n, problem.MBlock, problem.NBlock);
 
         // divide block work by [M, N]
         const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
@@ -491,8 +590,8 @@ struct GridwiseGemm_wmma_cshuffle_v3
 
         if(!block_2_ctile_map.ValidCTileIndex(
                block_work_idx,
-               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
-                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+               make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
         {
             return;
         }
@@ -506,19 +605,25 @@ struct GridwiseGemm_wmma_cshuffle_v3
 
         const index_t num_k_block_per_scale = GetKBlockPerScale();
 
-        Base::template Run<decltype(a_grid_desc_ak0_m_ak1),
-                           decltype(b_grid_desc_bk0_n_bk1),
-                           decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+        Base::template Run<decltype(as_grid_desc_ak0_m_ak1),
+                           decltype(bs_grid_desc_bk0_n_bk1),
+                           decltype(ds_grid_desc_mblock_mperblock_nblock_nperblock),
+                           decltype(e_grid_desc_mblock_mperblock_nblock_nperblock),
                            decltype(b_scale_struct),
                            HasMainKBlockLoop,
-                           CGlobalMemoryDataOperation,
-                           TailNum>(p_a_grid,
-                                    p_b_grid,
-                                    p_c_grid,
+                           EGlobalMemoryDataOperation,
+                           TailNum>(p_as_grid,
+                                    p_bs_grid,
+                                    p_ds_grid,
+                                    p_e_grid,
                                     p_shared,
-                                    a_grid_desc_ak0_m_ak1,
-                                    b_grid_desc_bk0_n_bk1,
-                                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    as_grid_desc_ak0_m_ak1,
+                                    bs_grid_desc_bk0_n_bk1,
+                                    ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op,
                                     block_m_id,
                                     block_n_id,
                                     num_k_block_per_scale,
@@ -528,17 +633,37 @@ struct GridwiseGemm_wmma_cshuffle_v3
     // Wrapper function to have __global__ function in common
     // between gemm_universal, b_scale, ab_scale, etc.
     template <bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              TailNumber TailNum = TailNumber::Odd>
+              InMemoryDataOperationEnum EGlobalMemoryDataOperation,
+              TailNumber TailNum>
     __device__ static void
-    Run(void* p_shared, const SplitKBatchOffset& splitk_batch_offset, const Argument& karg)
+    Run(void* p_shared, const SplitKBatchOffset& splitk_batch_offset, Argument& karg)
     {
-        Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        // shift A matrices pointer for splitk
+        AsGridPointer p_as_grid_splitk;
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType_    = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+            p_as_grid_splitk(i) = static_cast<const ADataType_*>(karg.p_as_grid[i]) +
+                                  splitk_batch_offset.a_k_split_offset[i];
+        });
+
+        // shift B matrices pointer for splitk
+        BsGridPointer p_bs_grid_splitk;
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType_    = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+            p_bs_grid_splitk(i) = static_cast<const BDataType_*>(karg.p_bs_grid[i]) +
+                                  splitk_batch_offset.b_k_split_offset[i];
+        });
+
+        Run<HasMainKBlockLoop, EGlobalMemoryDataOperation, TailNum>(
+            p_as_grid_splitk,
+            p_bs_grid_splitk,
+            karg.p_ds_grid,
+            karg.p_e_grid + splitk_batch_offset.c_reduce_offset,
             p_shared,
-            karg);
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.cde_element_op);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_b_scale.hpp
index 37ffbf1c51..46de6b156a 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_b_scale.hpp
@@ -20,15 +20,18 @@ namespace ck {
 
 template <typename ALayout,
           typename BLayout,
-          typename CLayout,
-          typename ADataType,
-          typename BDataType,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename BScaleType,
           typename AccDataType,
           typename CShuffleDataType,
-          typename CDataType,
+          typename DsDataType,
+          typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation,
+          typename CDEElementwiseOperation,
           tensor_operation::device::GemmSpecialization GemmSpec,
           index_t BlockSize,
           index_t ScaleBlockN, // scale N
@@ -60,11 +63,11 @@ template <typename ALayout,
           index_t BBlockLdsExtraN,
           index_t CShuffleMRepeatPerShuffle,
           index_t CShuffleNRepeatPerShuffle,
-          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
           BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4,
-          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeA                       = EDataType,
           typename ComputeTypeB                       = ComputeTypeA,
           bool PermuteA                               = false,
           bool PermuteB                               = false>
@@ -72,15 +75,17 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     : GridwiseGemm_wmma_cshuffle_v3_base<
           ALayout,
           BLayout,
-          CLayout,
-          ADataType,
-          BDataType,
+          DsLayout,
+          ELayout,
+          AsDataType,
+          BsDataType,
           AccDataType,
           CShuffleDataType,
-          CDataType,
+          DsDataType,
+          EDataType,
           AElementwiseOperation,
           BElementwiseOperation,
-          CElementwiseOperation,
+          CDEElementwiseOperation,
           GemmSpec,
           BlockSize,
           MPerBlock,
@@ -110,8 +115,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
           BBlockLdsExtraN,
           CShuffleMRepeatPerShuffle,
           CShuffleNRepeatPerShuffle,
-          CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          CShuffleBlockTransferScalarPerVector_NPerBlock,
+          CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          CDEShuffleBlockTransferScalarPerVectors,
           BlkGemmPipeSched,
           BlkGemmPipelineVer,
           ComputeTypeA,
@@ -119,20 +124,20 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
           PermuteA,
           PermuteB>
 {
-    using BScaleType = ck::half_t;
-
     using Base = GridwiseGemm_wmma_cshuffle_v3_base<
         ALayout,
         BLayout,
-        CLayout,
-        ADataType,
-        BDataType,
+        DsLayout,
+        ELayout,
+        AsDataType,
+        BsDataType,
         AccDataType,
         CShuffleDataType,
-        CDataType,
+        DsDataType,
+        EDataType,
         AElementwiseOperation,
         BElementwiseOperation,
-        CElementwiseOperation,
+        CDEElementwiseOperation,
         GemmSpec,
         BlockSize,
         MPerBlock,
@@ -162,8 +167,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
         BBlockLdsExtraN,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
-        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
@@ -196,35 +201,46 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     using Base::CalculateMPadded;
     using Base::CalculateNBlock;
     using Base::CalculateNPadded;
-    using Base::MakeAGridDescriptor_AK0_M_AK1;
-    using Base::MakeBGridDescriptor_BK0_N_BK1;
-    using Base::MakeCGridDescriptor_M_N;
+    using Base::MakeAsGridDescriptor_AK0_M_AK1;
+    using Base::MakeBsGridDescriptor_BK0_N_BK1;
+    using Base::MakeDEGridDescriptor_M_N;
+    using Base::MakeDsGridDescriptor_M_N;
+    using Base::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
 
     using Base::GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat;
 
-    using Base::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
+    using Base::MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
     using Base::GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1;
     using Base::GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1;
 
+    using Base::NumATensor;
+    using Base::NumBTensor;
+    using Base::NumDTensor;
+    using typename Base::AsGridPointer;
+    using typename Base::BsGridPointer;
+    using typename Base::DsGridPointer;
+
     struct Problem
     {
         __host__ Problem(index_t M_,
                          index_t N_,
                          index_t K_,
-                         index_t StrideA_,
-                         index_t StrideB_,
-                         index_t StrideC_,
+                         std::array<index_t, NumATensor> StrideAs_,
+                         std::array<index_t, NumBTensor> StrideBs_,
+                         std::array<index_t, NumDTensor> StrideDs_,
+                         index_t StrideE_,
                          index_t StrideScaleB_,
                          index_t KBatch_)
             : M{M_},
               N{N_},
               K{K_},
-              StrideA{StrideA_},
-              StrideB{StrideB_},
-              StrideC{StrideC_},
+              StrideAs{StrideAs_},
+              StrideBs{StrideBs_},
+              StrideDs{StrideDs_},
+              StrideE{StrideE_},
               StrideScaleB{StrideScaleB_},
               KBatch{KBatch_},
               MPadded{CalculateMPadded(M_)},
@@ -241,19 +257,37 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
         __host__ void Print() const
         {
             std::cout << "problem {" << "M:" << M << ", " << "N:" << N << ", " << "K:" << K << ", "
-                      << "SA:" << StrideA << ", " << "SB:" << StrideB << ", " << "SC:" << StrideC
-                      << ", " << "SScaleB:" << StrideScaleB << ", " << "MP:" << MPadded << ", "
-                      << "NP:" << NPadded << ", " << "KRead:" << KRead << ", " << "KP:" << KPadded
-                      << ", " << "AK0:" << AK0 << ", " << "BK0:" << BK0 << ", "
-                      << "MBlock: " << MBlock << ", " << "NBlock: " << NBlock << "}" << std::endl;
+                      << "SAs: {";
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                std::cout << StrideAs[i] << (i.value < NumATensor - 1 ? ", " : "");
+            });
+            std::cout << "}, " << "SBs: {";
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                std::cout << StrideBs[i] << (i.value < NumBTensor - 1 ? ", " : "");
+            });
+            std::cout << "}, ";
+            if constexpr(NumDTensor > 0)
+            {
+                std::cout << "SDs: { ";
+                static_for<0, NumDTensor, 1>{}([&](auto i) {
+                    std::cout << StrideDs[i] << (i.value < NumDTensor - 1 ? ", " : "");
+                });
+                std::cout << " }, ";
+            }
+            std::cout << "SE:" << StrideE << ", " << "SScaleB:" << StrideScaleB << ", "
+                      << "MP:" << MPadded << ", " << "NP:" << NPadded << ", " << "KRead:" << KRead
+                      << ", " << "KP:" << KPadded << ", " << "AK0:" << AK0 << ", " << "BK0:" << BK0
+                      << ", " << "MBlock: " << MBlock << ", " << "NBlock: " << NBlock << "}"
+                      << std::endl;
         }
 
         index_t M;
         index_t N;
         index_t K;
-        index_t StrideA;
-        index_t StrideB;
-        index_t StrideC;
+        std::array<index_t, NumATensor> StrideAs;
+        std::array<index_t, NumBTensor> StrideBs;
+        std::array<index_t, NumDTensor> StrideDs;
+        index_t StrideE;
         index_t StrideScaleB;
         index_t KBatch;
         index_t MPadded;
@@ -269,32 +303,64 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     // Argument
     struct Argument : public tensor_operation::device::BaseArgument, public Problem
     {
-        __host__ Argument(const ADataType* p_a_grid_,
-                          const BDataType* p_b_grid_,
-                          CDataType* p_c_grid_,
+        __host__ Argument(std::array<const void*, NumATensor> p_as_grid_,
+                          std::array<const void*, NumBTensor> p_bs_grid_,
+                          std::array<const void*, NumDTensor> p_ds_grid_,
+                          EDataType* p_e_grid_,
                           index_t M_,
                           index_t N_,
                           index_t K_,
-                          index_t StrideA_,
-                          index_t StrideB_,
-                          index_t StrideC_,
+                          std::array<index_t, NumATensor> StrideAs_,
+                          std::array<index_t, NumBTensor> StrideBs_,
+                          std::array<index_t, NumDTensor> StrideDs_,
+                          index_t StrideE_,
                           index_t StrideScaleB_,
                           const BScaleType* p_b_scale_grid_,
                           index_t k_batch_,
                           AElementwiseOperation a_element_op_,
                           BElementwiseOperation b_element_op_,
-                          CElementwiseOperation c_element_op_,
+                          CDEElementwiseOperation cde_element_op_,
                           bool is_reduce_ = false)
-            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, StrideScaleB_, k_batch_},
-              p_a_grid{p_a_grid_},
-              p_b_grid{p_b_grid_},
-              p_c_grid{p_c_grid_},
+            : Problem{M_,
+                      N_,
+                      K_,
+                      StrideAs_,
+                      StrideBs_,
+                      StrideDs_,
+                      StrideE_,
+                      StrideScaleB_,
+                      k_batch_},
+              p_as_grid{},
+              p_bs_grid{},
+              p_ds_grid{},
+              p_e_grid{p_e_grid_},
               p_b_scale_grid{p_b_scale_grid_},
               a_element_op{a_element_op_},
               b_element_op{b_element_op_},
-              c_element_op{c_element_op_},
+              cde_element_op{cde_element_op_},
               is_reduce(is_reduce_)
         {
+            // populate pointer, desc for As
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ADataType_ = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                // A pointer
+                p_as_grid(i) = static_cast<const ADataType_*>(p_as_grid_[i]);
+            });
+
+            // populate pointer, desc for Bs
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BDataType_ = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                // B pointer
+                p_bs_grid(i) = static_cast<const BDataType_*>(p_bs_grid_[i]);
+            });
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                p_ds_grid(i) = static_cast<const DDataType*>(p_ds_grid_[i]);
+            });
         }
 
         __host__ __device__ inline bool IsReduceAdd() const
@@ -307,59 +373,70 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
             return (Problem::KBatch > 1) && (!is_reduce);
         }
 
-        const ADataType* p_a_grid;
-        const BDataType* p_b_grid;
-        CDataType* p_c_grid;
+        AsGridPointer p_as_grid;
+        BsGridPointer p_bs_grid;
+        DsGridPointer p_ds_grid;
+        EDataType* p_e_grid;
 
         const BScaleType* p_b_scale_grid;
         const AElementwiseOperation a_element_op;
         const BElementwiseOperation b_element_op;
-        const CElementwiseOperation c_element_op;
+        const CDEElementwiseOperation cde_element_op;
         bool is_reduce;
     };
 
     struct SplitKBatchOffset
     {
 
-        __device__ SplitKBatchOffset(Argument& karg)
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
         {
+            // Note: in xdl implementation multiple AB supports one layout
+            // but multiple strides, so we create an array of offsets with
+            // the same values.
+            // It should be fixed later on. Once we will have a thread transfer
+            // more flexible.
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
+                static_for<0, NumATensor, 1>{}(
+                    [&](auto i) { a_k_split_offset[i] = k_id * karg.KRead / APackedSize; });
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+                static_for<0, NumATensor, 1>{}(
+                    [&](auto i) { a_k_split_offset[i] = k_id * karg.KRead * karg.StrideAs[i]; });
             }
 
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
             {
-                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+                static_for<0, NumBTensor, 1>{}(
+                    [&](auto i) { b_k_split_offset[i] = k_id * karg.KRead * karg.StrideBs[i]; });
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
                 if constexpr(!PermuteB)
                 {
-                    b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+                    static_for<0, NumBTensor, 1>{}(
+                        [&](auto i) { b_k_split_offset[i] = k_id * karg.KRead / BPackedSize; });
                 }
                 else
                 {
                     const int k0_offset = karg.KRead * karg.N;
-                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                    static_for<0, NumBTensor, 1>{}(
+                        [&](auto i) { b_k_split_offset[i] = k_id * k0_offset / BPackedSize; });
                 }
             }
 
             // Calculate B scale offset
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
             {
-                scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK) * karg.StrideB;
+                scale_k_split_offset = k_id * (karg.KRead / ScaleBlockK) * karg.StrideB;
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
-                scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK);
+                scale_k_split_offset = k_id * (karg.KRead / ScaleBlockK);
             }
 
-            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            if(k_id < karg.KBatch - 1)
             {
                 karg.K = karg.KRead;
             }
@@ -370,7 +447,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
 
             if(karg.IsReduceAdd())
             {
-                c_reduce_offset = blockIdx.z * karg.M * karg.N;
+                c_reduce_offset = k_id * karg.M * karg.N;
             }
             else
             {
@@ -378,8 +455,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
             }
         }
 
-        index_t a_k_split_offset;
-        index_t b_k_split_offset;
+        std::array<index_t, NumATensor> a_k_split_offset;
+        std::array<index_t, NumBTensor> b_k_split_offset;
         index_t scale_k_split_offset; // New member for scale matrix offset
         index_t c_reduce_offset;
     };
@@ -391,7 +468,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
     // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
 
-    template <index_t NumberOfBuffers, typename BScaleGridDesc_BN_AK, typename BScaleType>
+    template <index_t NumberOfBuffers, typename BScaleGridDesc_BN_AK>
     __device__ static auto MakeBScale(const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak,
                                       const BScaleType* p_b_scale_grid,
                                       index_t block_n_id)
@@ -454,24 +531,33 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     }
 
     template <bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              TailNumber TailNum = TailNumber::Odd>
-    __device__ static void Run(const ADataType* p_a_grid,
-                               const BDataType* p_b_grid,
-                               CDataType* p_c_grid,
+              InMemoryDataOperationEnum EGlobalMemoryDataOperation,
+              TailNumber TailNum>
+    __device__ static void Run(AsGridPointer& p_as_grid,
+                               BsGridPointer& p_bs_grid,
+                               DsGridPointer& p_ds_grid,
+                               EDataType* p_e_grid,
                                const BScaleType* p_b_scale_grid,
                                void* p_shared,
-                               const Problem& problem)
+                               const Problem& problem,
+                               AElementwiseOperation a_element_op,
+                               BElementwiseOperation b_element_op,
+                               CDEElementwiseOperation cde_element_op)
     {
-        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
-        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
-            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
-        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
-            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
-        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0);
+        const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideBs, problem.BK0);
+        const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
+        const auto e_grid_desc_m_n = Base::template MakeDEGridDescriptor_M_N<ELayout>(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideE);
+        const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                e_grid_desc_m_n, problem.MBlock, problem.NBlock);
 
         // B Scale grid
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
@@ -487,8 +573,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
 
         if(!block_2_ctile_map.ValidCTileIndex(
                block_work_idx,
-               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
-                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+               make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
         {
             return;
         }
@@ -501,19 +587,25 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
 
         const index_t num_k_block_per_scale = GetKBlockPerScale();
 
-        Base::template Run<decltype(a_grid_desc_ak0_m_ak1),
-                           decltype(b_grid_desc_bk0_n_bk1),
-                           decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+        Base::template Run<decltype(as_grid_desc_ak0_m_ak1),
+                           decltype(bs_grid_desc_bk0_n_bk1),
+                           decltype(ds_grid_desc_mblock_mperblock_nblock_nperblock),
+                           decltype(e_grid_desc_mblock_mperblock_nblock_nperblock),
                            decltype(b_scale_struct),
                            HasMainKBlockLoop,
-                           CGlobalMemoryDataOperation,
-                           TailNum>(p_a_grid,
-                                    p_b_grid,
-                                    p_c_grid,
+                           EGlobalMemoryDataOperation,
+                           TailNum>(p_as_grid,
+                                    p_bs_grid,
+                                    p_ds_grid,
+                                    p_e_grid,
                                     p_shared,
-                                    a_grid_desc_ak0_m_ak1,
-                                    b_grid_desc_bk0_n_bk1,
-                                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    as_grid_desc_ak0_m_ak1,
+                                    bs_grid_desc_bk0_n_bk1,
+                                    ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op,
                                     block_m_id,
                                     block_n_id,
                                     num_k_block_per_scale,
@@ -523,18 +615,38 @@ struct GridwiseGemm_wmma_cshuffle_v3_b_scale
     // NOTE: Wrapper function to have __global__ function in common
     // between gemm_universal, b_scale, ab_scale, etc.
     template <bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              TailNumber TailNum = TailNumber::Odd>
+              InMemoryDataOperationEnum EGlobalMemoryDataOperation,
+              TailNumber TailNum>
     __device__ static void
-    Run(void* p_shared, const SplitKBatchOffset& splitk_batch_offset, const Argument& karg)
+    Run(void* p_shared, const SplitKBatchOffset& splitk_batch_offset, Argument& karg)
     {
-        Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        // shift A matrices pointer for splitk
+        AsGridPointer p_as_grid_splitk;
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType_    = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+            p_as_grid_splitk(i) = static_cast<const ADataType_*>(karg.p_as_grid[i]) +
+                                  splitk_batch_offset.a_k_split_offset[i];
+        });
+
+        // shift B matrices pointer for splitk
+        BsGridPointer p_bs_grid_splitk;
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType_    = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+            p_bs_grid_splitk(i) = static_cast<const BDataType_*>(karg.p_bs_grid[i]) +
+                                  splitk_batch_offset.b_k_split_offset[i];
+        });
+
+        Run<HasMainKBlockLoop, EGlobalMemoryDataOperation, TailNum>(
+            p_as_grid_splitk,
+            p_bs_grid_splitk,
+            karg.p_ds_grid,
+            karg.p_e_grid + splitk_batch_offset.c_reduce_offset,
             karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
             p_shared,
-            karg);
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.cde_element_op);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
index c60dba3b48..dac0c9b3b0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
@@ -3,6 +3,11 @@
 
 #pragma once
 
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
+#include <iostream>
+#include <ostream>
+#endif
+
 #include "ck/utility/env.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
@@ -11,7 +16,8 @@
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
-#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
@@ -19,7 +25,7 @@ namespace ck {
 
 template <typename GridwiseGemm,
           bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          InMemoryDataOperationEnum EGlobalMemoryDataOperation,
           index_t MinimumOccupancy = 1,
           TailNumber TailNum       = TailNumber::Full>
 __global__ void
@@ -31,17 +37,17 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #if(defined(__gfx11__) || defined(__gfx12__))
 #if defined(__gfx11__)
     // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
-    using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_c_grid)>>;
-    if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
-                   (std::is_same_v<c_data_type, ck::half_t> ||
-                    std::is_same_v<c_data_type, ck::bhalf_t>)))
+    using e_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_e_grid)>>;
+    if constexpr(!(EGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
+                   (std::is_same_v<e_data_type, ck::half_t> ||
+                    std::is_same_v<e_data_type, ck::bhalf_t>)))
     {
 #endif
         __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, EGlobalMemoryDataOperation, TailNum>(
             p_shared, splitk_batch_offset, karg);
 
 #if defined(__gfx11__)
@@ -54,15 +60,17 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 
 template <typename ALayout,
           typename BLayout,
-          typename CLayout,
-          typename ADataType,
-          typename BDataType,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
           typename AccDataType,
           typename CShuffleDataType,
-          typename CDataType,
+          typename DsDataType,
+          typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation,
+          typename CDEElementwiseOperation,
           tensor_operation::device::GemmSpecialization GemmSpec,
           index_t BlockSize,
           index_t MPerBlock,
@@ -92,8 +100,8 @@ template <typename ALayout,
           index_t BBlockLdsExtraN,
           index_t CShuffleMRepeatPerShuffle,
           index_t CShuffleNRepeatPerShuffle,
-          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched,
           BlockGemmPipelineVersion BlkGemmPipelineVer,
           typename ComputeTypeA,
@@ -112,6 +120,21 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
     static constexpr auto I6 = Number<6>{};
     static constexpr auto I7 = Number<7>{};
 
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+
+    using LDSTypeA =
+        typename std::conditional<(NumATensor > 1),
+                                  ComputeTypeA,
+                                  remove_cvref_t<tuple_element_t<0, AsDataType>>>::type;
+    using LDSTypeB =
+        typename std::conditional<(NumBTensor > 1),
+                                  ComputeTypeB,
+                                  remove_cvref_t<tuple_element_t<0, BsDataType>>>::type;
+
+    static constexpr auto EShuffleBlockTransferScalarPerVector =
+        CDEShuffleBlockTransferScalarPerVectors{}[I0];
+
     // K1 should be Number<...>
     static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
     static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
@@ -126,14 +149,14 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
     static constexpr index_t APackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+        if constexpr(is_same_v<remove_cvref_t<LDSTypeA>, pk_i4_t>)
             return 2;
         else
             return 1;
     }();
 
     static constexpr index_t BPackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+        if constexpr(is_same_v<remove_cvref_t<LDSTypeB>, pk_i4_t>)
             return 2;
         else
             return 1;
@@ -220,6 +243,31 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
             make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
     }
 
+    static constexpr auto MakeAsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using ADataType_ = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                return static_cast<const ADataType_*>(nullptr);
+            },
+            Number<NumATensor>{});
+    }
+
+    static constexpr auto MakeBsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using BDataType_ = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                return static_cast<const BDataType_*>(nullptr);
+            },
+            Number<NumBTensor>{});
+    }
+
+    using AsGridPointer = decltype(MakeAsGridPointer());
+    using BsGridPointer = decltype(MakeBsGridPointer());
+
     __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
         index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
     {
@@ -304,6 +352,21 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         }
     }
 
+    __host__ __device__ static auto
+    MakeAsGridDescriptor_AK0_M_AK1(const index_t M,
+                                   const index_t MPad,
+                                   const index_t K,
+                                   const index_t KPad,
+                                   const std::array<index_t, NumATensor>& StrideAs,
+                                   const index_t AK0)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                return MakeAGridDescriptor_AK0_M_AK1(M, MPad, K, KPad, StrideAs[i], AK0);
+            },
+            Number<NumATensor>{});
+    }
+
     __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
         index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
     {
@@ -320,7 +383,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
 
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
-        static_assert(!(is_same_v<remove_cvref_t<BDataType>, pk_i4_t> &&
+        static_assert(!(is_same_v<remove_cvref_t<LDSTypeB>, pk_i4_t> &&
                         GemmSpec != GemmSpecialization::Default),
                       "pk_i4_t does not support padding");
 
@@ -414,6 +477,21 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         }
     }
 
+    __host__ __device__ static auto
+    MakeBsGridDescriptor_BK0_N_BK1(const index_t K,
+                                   const index_t KPad,
+                                   const index_t N,
+                                   const index_t NPad,
+                                   const std::array<index_t, NumBTensor>& StrideBs,
+                                   const index_t BK0)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                return MakeBGridDescriptor_BK0_N_BK1(K, KPad, N, NPad, StrideBs[i], BK0);
+            },
+            Number<NumBTensor>{});
+    }
+
     template <typename ABlockDesc_AK0_M_AK1>
     __host__ __device__ static constexpr auto MakeAWmmaTileDescriptor(const ABlockDesc_AK0_M_AK1&)
     {
@@ -430,17 +508,18 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         return MakeWmmaTileDescriptor<NRepeat, NWaves, NPerWmma>(BBlockDesc_BK0_N_BK1{});
     }
 
+    template <typename DELayout>
     __host__ __device__ static auto
-    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    MakeDEGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideDE)
     {
         const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, DELayout>::value)
             {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideDE, I1));
             }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, DELayout>::value)
             {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideDE));
             }
         }();
 
@@ -493,6 +572,44 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
 #endif
     }
 
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static constexpr auto MakeDsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                return static_cast<const DDataType*>(nullptr);
+            },
+            Number<NumDTensor>{});
+    }
+
+    using DsGridPointer = decltype(MakeDsGridPointer());
+
+    __host__ __device__ static auto MakeDsGridDescriptor_M_N(
+        index_t M, index_t MPad, index_t N, index_t NPad, std::array<index_t, NumDTensor> StrideDs)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                return MakeDEGridDescriptor_M_N<DLayout>(M, MPad, N, NPad, StrideDs[i]);
+            },
+            Number<NumDTensor>{});
+    }
+
+    template <typename DsGridDesc>
+    __device__ static constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const DsGridDesc& ds_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                return MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    ds_grid_desc_m_n[i], MBlock, NBlock);
+            },
+            Number<NumDTensor>{});
+    }
+
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
         // A matrix in LDS memory, dst of blockwise copy
@@ -508,7 +625,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         // in some cases.
         else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
-            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize;
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(LDSTypeA) / APackedSize;
             constexpr auto MLdsLayer        = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
@@ -555,20 +672,20 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
             constexpr auto KThreadRead      = 64 / MPerWmma;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
-            constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
+            constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
                                        ? 1
-                                       : 128 / (AK1Number * M0 * sizeof(ADataType));
+                                       : 128 / (AK1Number * M0 * sizeof(LDSTypeA));
             constexpr auto KThreadReadPerm =
                 (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
                     ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
                     : KThreadRead;
 
             // 1<=mpair<=n0
-            constexpr auto mpair = (AK1Number * MPerWmma * sizeof(ADataType) > 128)
+            constexpr auto mpair = (AK1Number * MPerWmma * sizeof(LDSTypeA) > 128)
                                        ? 1
-                                       : ((128 / (AK1Number * MPerWmma * sizeof(ADataType))) > M0
+                                       : ((128 / (AK1Number * MPerWmma * sizeof(LDSTypeA))) > M0
                                               ? M0
-                                              : 128 / (AK1Number * MPerWmma * sizeof(ADataType)));
+                                              : 128 / (AK1Number * MPerWmma * sizeof(LDSTypeA)));
 
             constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
                 make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
@@ -645,7 +762,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
             // NLdsLayer * K0 as logical Bank
-            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize;
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(LDSTypeB) / BPackedSize;
             constexpr index_t NLdsLayer     = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
@@ -689,20 +806,20 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
             constexpr auto KThreadRead      = 64 / NPerWmma;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
-            constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
+            constexpr auto kfold = (BK1Number * N0 * sizeof(LDSTypeB) > 128)
                                        ? 1
-                                       : 128 / (BK1Number * N0 * sizeof(BDataType));
+                                       : 128 / (BK1Number * N0 * sizeof(LDSTypeB));
             constexpr auto KThreadReadPerm =
                 (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
                     ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
                     : KThreadRead;
 
             // 1<=npair<=n0
-            constexpr auto npair = (BK1Number * NPerWmma * sizeof(BDataType) > 128)
+            constexpr auto npair = (BK1Number * NPerWmma * sizeof(LDSTypeB) > 128)
                                        ? 1
-                                       : ((128 / (BK1Number * NPerWmma * sizeof(BDataType))) > N0
+                                       : ((128 / (BK1Number * NPerWmma * sizeof(LDSTypeB))) > N0
                                               ? N0
-                                              : 128 / (BK1Number * NPerWmma * sizeof(BDataType)));
+                                              : 128 / (BK1Number * NPerWmma * sizeof(LDSTypeB)));
 
             constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
                 make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
@@ -787,8 +904,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                  BlkGemmPipelineVer,
                  BlkGemmPipeSched,
                  BlockSize,
-                 ADataType,
-                 BDataType,
+                 LDSTypeA,
+                 LDSTypeB,
                  ComputeTypeA,
                  ComputeTypeB,
                  AccDataType,
@@ -805,18 +922,18 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                  NRepeat,
                  KPack>())>;
 
-    template <typename CGridDesc>
-    __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    template <typename DEGridDesc>
+    __device__ static constexpr auto MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const DEGridDesc& de_grid_desc_m_n, index_t MBlock, index_t NBlock)
     {
-        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
-            c_grid_desc_m_n,
+        const auto de_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            de_grid_desc_m_n,
             make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
                        make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
 
-        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+        return de_grid_desc_mblock_mperblock_nblock_nperblock;
     }
 
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
@@ -950,56 +1067,51 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
             }
         }
 
-        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ELayout>::value)
         {
-            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            if(karg.N % EShuffleBlockTransferScalarPerVector != 0)
             {
                 if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
                 {
                     std::cout << "Arg N (" << karg.N
                               << ") value is not a multiple of "
-                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
-                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
-                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
-                              << std::endl;
+                                 "EShuffleBlockTransferScalarPerVector ("
+                              << EShuffleBlockTransferScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
                 }
                 return false;
             }
         }
         else
         {
-            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            if(karg.M % EShuffleBlockTransferScalarPerVector != 0)
             {
                 if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
                 {
                     std::cout << "Arg M (" << karg.M
                               << ") value is not a multiple of "
-                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
-                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
-                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
-                              << std::endl;
+                                 "EShuffleBlockTransferScalarPerVector ("
+                              << EShuffleBlockTransferScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
                 }
                 return false;
             }
         }
 
-        if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
-                       is_same<remove_cvref_t<CDataType>, float>::value ||
-                       is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
-                       is_same<remove_cvref_t<CDataType>, int32_t>::value))
+        if constexpr(!(is_same<remove_cvref_t<EDataType>, half_t>::value ||
+                       is_same<remove_cvref_t<EDataType>, float>::value ||
+                       is_same<remove_cvref_t<EDataType>, bhalf_t>::value ||
+                       is_same<remove_cvref_t<EDataType>, int32_t>::value))
         {
-            if(!karg.IsReduceAdd())
+            if(karg.IsAtomicAdd() && karg.KBatch > 1)
             {
                 if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
                 {
-                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not supported yet"
-                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
-                              << std::endl;
-                }
-                if(karg.KBatch > 1)
-                {
-                    return false;
+                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not supported for this "
+                              << "destination type (EDataType) " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
                 }
+                return false;
             }
         }
 
@@ -1010,6 +1122,27 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         {
             if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
             {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Pipeline validation failed: num_k_loop (" << num_k_loop
+                              << ") <= PrefetchStages (" << BlockwiseGemmPipe::PrefetchStages
+                              << ") for pipeline version != v1." << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<remove_cvref_t<EDataType>, int8_t>::value)
+        {
+            if(karg.KBatch > 1)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "int8_t does not support KBatch > 1. KBatch: " << karg.KBatch
+                              << " " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
                 return false;
             }
         }
@@ -1055,41 +1188,74 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
             c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat
                 .GetElementSpaceSize();
 
-        return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize +
-                          b_block_space_size_aligned * sizeof(BDataType) / BPackedSize),
+        return math::max((a_block_space_size_aligned * sizeof(LDSTypeA) / APackedSize +
+                          b_block_space_size_aligned * sizeof(LDSTypeB) / BPackedSize),
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    template <index_t numElements, typename Type>
+    __device__ __forceinline__ static auto get_first_element_workaround(Type& array)
+    {
+        if constexpr(numElements > 1)
+        {
+            return array;
+        }
+        else
+        {
+            return array[I0];
+        }
+    }
+
     template <typename AGridDesc_AK0_M_K1,
               typename BGridDesc_BK0_N_K1,
-              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              typename DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
               typename BScaleStruct,
               bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              InMemoryDataOperationEnum EGlobalMemoryDataOperation,
               TailNumber TailNum = TailNumber::Odd>
-    __device__ static void Run(const ADataType* p_a_grid,
-                               const BDataType* p_b_grid,
-                               CDataType* p_c_grid,
+    __device__ static void Run(AsGridPointer p_as_grid,
+                               BsGridPointer p_bs_grid,
+                               DsGridPointer p_ds_grid,
+                               EDataType* p_e_grid,
                                void* p_shared,
-                               const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
-                               const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
-                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const AGridDesc_AK0_M_K1& as_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BK0_N_K1& bs_grid_desc_bk0_n_bk1,
+                               const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   e_grid_desc_mblock_mperblock_nblock_nperblock,
+                               AElementwiseOperation a_element_op,
+                               BElementwiseOperation b_element_op,
+                               CDEElementwiseOperation cde_element_op,
                                const index_t& block_m_id,
                                const index_t& block_n_id,
                                const index_t& num_k_block_per_scale,
                                BScaleStruct& b_scale_struct)
     {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+        const auto as_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_as_grid[i], as_grid_desc_ak0_m_ak1[i].GetElementSpaceSize());
+            },
+            Number<NumATensor>{});
 
-        const AElementwiseOperation a_element_op{};
-        const BElementwiseOperation b_element_op{};
-        const CElementwiseOperation c_element_op{};
+        const auto bs_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_bs_grid[i], bs_grid_desc_bk0_n_bk1[i].GetElementSpaceSize());
+            },
+            Number<NumBTensor>{});
+
+        const auto ds_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_ds_grid[i],
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize());
+            },
+            Number<NumDTensor>{});
+        auto e_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
 
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
@@ -1108,66 +1274,144 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
 
         // A matrix blockwise copy
-        auto a_blockwise_copy =
-            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
-                                                AElementwiseOperation,
-                                                ck::tensor_operation::element_wise::PassThrough,
-                                                InMemoryDataOperationEnum::Set,
-                                                Sequence<AK0Number, MPerBlock, AK1Number>,
-                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
-                                                ABlockTransferThreadClusterArrangeOrder,
-                                                ADataType,
-                                                ADataType,
-                                                decltype(a_grid_desc_ak0_m_ak1),
-                                                decltype(a_block_desc_ak0_m_ak1),
-                                                ABlockTransferSrcAccessOrder,
-                                                Sequence<0, 1, 2>,
-                                                ABlockTransferSrcVectorDim,
-                                                2,
-                                                ABlockTransferSrcScalarPerVector,
-                                                ABlockTransferDstScalarPerVector_AK1,
-                                                1,
-                                                1,
-                                                AThreadTransferSrcResetCoordinateAfterRun,
-                                                true,
-                                                BlockwiseGemmPipe::GlobalBufferNum>(
-                a_grid_desc_ak0_m_ak1,
-                make_multi_index(0, m_block_data_idx_on_grid, 0),
-                a_element_op,
-                a_block_desc_ak0_m_ak1,
-                make_multi_index(0, 0, 0),
-                ck::tensor_operation::element_wise::PassThrough{});
+        // workaround because v7r2 is not as general as v4r1
+        auto get_a_blockwise_transfer = [&]() {
+            if constexpr(NumATensor > 1)
+            {
+                const auto idx_as_block_begin = generate_tuple(
+                    [&](auto) { return make_multi_index(0, m_block_data_idx_on_grid, 0); },
+                    Number<NumATensor>{});
+
+                return ThreadGroupTensorSliceTransfer_v7r2<
+                    ThisThreadBlock,
+                    AsDataType,
+                    Tuple<LDSTypeA>,
+                    AGridDesc_AK0_M_K1,
+                    decltype(tie(a_block_desc_ak0_m_ak1)),
+                    AElementwiseOperation,
+                    Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
+                    Sequence<AK0Number, MPerBlock, AK1Number>,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    ABlockTransferSrcAccessOrder,
+                    Sequence<1, 0, 2>,
+                    ABlockTransferSrcVectorDim,
+                    2,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    uniform_sequence_gen_t<NumATensor, AThreadTransferSrcResetCoordinateAfterRun>,
+                    Sequence<true>,
+                    BlockwiseGemmPipe::GlobalBufferNum>{as_grid_desc_ak0_m_ak1,
+                                                        idx_as_block_begin,
+                                                        tie(a_block_desc_ak0_m_ak1),
+                                                        make_tuple(make_multi_index(0, 0, 0)),
+                                                        a_element_op};
+            }
+            else
+            {
+                return ThreadGroupTensorSliceTransfer_v4r1<
+                    ThisThreadBlock,
+                    AElementwiseOperation,
+                    ck::tensor_operation::element_wise::PassThrough,
+                    InMemoryDataOperationEnum::Set,
+                    Sequence<AK0Number, MPerBlock, AK1Number>,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    remove_cvref_t<tuple_element_t<0, AsDataType>>,
+                    remove_cvref_t<tuple_element_t<0, AsDataType>>,
+                    decltype(as_grid_desc_ak0_m_ak1[I0]),
+                    decltype(a_block_desc_ak0_m_ak1),
+                    ABlockTransferSrcAccessOrder,
+                    Sequence<0, 1, 2>,
+                    ABlockTransferSrcVectorDim,
+                    2,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    1,
+                    1,
+                    AThreadTransferSrcResetCoordinateAfterRun,
+                    true,
+                    BlockwiseGemmPipe::GlobalBufferNum>(
+                    as_grid_desc_ak0_m_ak1[I0],
+                    make_multi_index(0, m_block_data_idx_on_grid, 0),
+                    a_element_op,
+                    a_block_desc_ak0_m_ak1,
+                    make_multi_index(0, 0, 0),
+                    ck::tensor_operation::element_wise::PassThrough{});
+            }
+        };
+
+        auto a_blockwise_copy = get_a_blockwise_transfer();
 
         // B matrix blockwise copy
-        auto b_blockwise_copy =
-            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
-                                                BElementwiseOperation,
-                                                ck::tensor_operation::element_wise::PassThrough,
-                                                InMemoryDataOperationEnum::Set,
-                                                Sequence<BK0Number, NPerBlock, BK1Number>,
-                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
-                                                BBlockTransferThreadClusterArrangeOrder,
-                                                BDataType,
-                                                BDataType,
-                                                decltype(b_grid_desc_bk0_n_bk1),
-                                                decltype(b_block_desc_bk0_n_bk1),
-                                                BBlockTransferSrcAccessOrder,
-                                                Sequence<0, 1, 2>,
-                                                BBlockTransferSrcVectorDim,
-                                                2,
-                                                BBlockTransferSrcScalarPerVector,
-                                                BBlockTransferDstScalarPerVector_BK1,
-                                                1,
-                                                1,
-                                                BThreadTransferSrcResetCoordinateAfterRun,
-                                                true,
-                                                BlockwiseGemmPipe::GlobalBufferNum>(
-                b_grid_desc_bk0_n_bk1,
-                make_multi_index(0, n_block_data_idx_on_grid, 0),
-                b_element_op,
-                b_block_desc_bk0_n_bk1,
-                make_multi_index(0, 0, 0),
-                ck::tensor_operation::element_wise::PassThrough{});
+        // workaround because v7r2 is not as general as v4r1
+        auto get_b_blockwise_transfer = [&]() {
+            if constexpr(NumBTensor > 1)
+            {
+                const auto idx_bs_block_begin = generate_tuple(
+                    [&](auto) { return make_multi_index(0, n_block_data_idx_on_grid, 0); },
+                    Number<NumBTensor>{});
+
+                return ThreadGroupTensorSliceTransfer_v7r2<
+                    ThisThreadBlock,
+                    BsDataType,
+                    Tuple<LDSTypeB>,
+                    BGridDesc_BK0_N_K1,
+                    decltype(tie(b_block_desc_bk0_n_bk1)),
+                    BElementwiseOperation,
+                    Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
+                    Sequence<BK0Number, NPerBlock, BK1Number>,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    BBlockTransferSrcAccessOrder,
+                    Sequence<1, 0, 2>,
+                    BBlockTransferSrcVectorDim,
+                    2,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    uniform_sequence_gen_t<NumBTensor, BThreadTransferSrcResetCoordinateAfterRun>,
+                    Sequence<true>,
+                    BlockwiseGemmPipe::GlobalBufferNum>{bs_grid_desc_bk0_n_bk1,
+                                                        idx_bs_block_begin,
+                                                        tie(b_block_desc_bk0_n_bk1),
+                                                        make_tuple(make_multi_index(0, 0, 0)),
+                                                        b_element_op};
+            }
+            else
+            {
+                return ThreadGroupTensorSliceTransfer_v4r1<
+                    ThisThreadBlock,
+                    BElementwiseOperation,
+                    ck::tensor_operation::element_wise::PassThrough,
+                    InMemoryDataOperationEnum::Set,
+                    Sequence<BK0Number, NPerBlock, BK1Number>,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    remove_cvref_t<tuple_element_t<0, BsDataType>>,
+                    remove_cvref_t<tuple_element_t<0, BsDataType>>,
+                    decltype(bs_grid_desc_bk0_n_bk1[I0]),
+                    decltype(b_block_desc_bk0_n_bk1),
+                    BBlockTransferSrcAccessOrder,
+                    Sequence<0, 1, 2>,
+                    BBlockTransferSrcVectorDim,
+                    2,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    1,
+                    1,
+                    BThreadTransferSrcResetCoordinateAfterRun,
+                    true,
+                    BlockwiseGemmPipe::GlobalBufferNum>(
+                    bs_grid_desc_bk0_n_bk1[I0],
+                    make_multi_index(0, n_block_data_idx_on_grid, 0),
+                    b_element_op,
+                    b_block_desc_bk0_n_bk1,
+                    make_multi_index(0, 0, 0),
+                    ck::tensor_operation::element_wise::PassThrough{});
+            }
+        };
+
+        auto b_blockwise_copy = get_b_blockwise_transfer();
 
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
@@ -1175,12 +1419,12 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
 
         // Cast after lds
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<LDSTypeA*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
-                                                                            sizeof(ADataType) /
-                                                                            APackedSize),
+            reinterpret_cast<LDSTypeB*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
+                                                                           sizeof(LDSTypeA) /
+                                                                           APackedSize),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
@@ -1192,25 +1436,26 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
         auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
 
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
-            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            (as_grid_desc_ak0_m_ak1[I0].GetLength(I0) * as_grid_desc_ak0_m_ak1[I0].GetLength(I2)) /
             KPerBlock);
 
-        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
-                                                                         a_block_desc_ak0_m_ak1,
-                                                                         a_blockwise_copy,
-                                                                         a_grid_buf,
-                                                                         a_block_buf,
-                                                                         a_block_slice_copy_step,
-                                                                         b_grid_desc_bk0_n_bk1,
-                                                                         b_block_desc_bk0_n_bk1,
-                                                                         b_blockwise_copy,
-                                                                         b_grid_buf,
-                                                                         b_block_buf,
-                                                                         b_block_slice_copy_step,
-                                                                         c_thread_buf,
-                                                                         b_scale_struct,
-                                                                         num_k_block_main_loop,
-                                                                         num_k_block_per_scale);
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+            get_first_element_workaround<NumATensor>(as_grid_desc_ak0_m_ak1),
+            a_block_desc_ak0_m_ak1,
+            a_blockwise_copy,
+            get_first_element_workaround<NumATensor>(as_grid_buf),
+            a_block_buf,
+            a_block_slice_copy_step,
+            get_first_element_workaround<NumBTensor>(bs_grid_desc_bk0_n_bk1),
+            b_block_desc_bk0_n_bk1,
+            b_blockwise_copy,
+            get_first_element_workaround<NumBTensor>(bs_grid_buf),
+            b_block_buf,
+            b_block_slice_copy_step,
+            c_thread_buf,
+            b_scale_struct,
+            num_k_block_main_loop,
+            num_k_block_per_scale);
 
         // shuffle C and write out
         {
@@ -1330,31 +1575,58 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                                  m_thread_data_on_block_idx[I3]),
                 ck::tensor_operation::element_wise::PassThrough{}};
 
-            // shuffle: blockwise copy C from LDS to global
-            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
-                ThisThreadBlock,            // ThreadGroup
-                CElementwiseOperation,      // ElementwiseOperation,
-                CGlobalMemoryDataOperation, // DstInMemOp,
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_desc_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat),
+                generate_tie([&](auto i) -> const auto& // return type should be reference
+                             { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
+                             Number<NumDTensor>{}));
+
+            // tuple of reference to C/Ds tensor buffers
+            const auto c_ds_buf_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_buf),
+                generate_tie([&](auto i) -> const auto& // return type should be reference
+                             { return ds_grid_buf[i]; },
+                             Number<NumDTensor>{}));
+
+            // tuple of starting index of C/Ds blockwise copy
+            const auto idx_c_ds_block_begin = container_concat(
+                make_tuple(make_multi_index(0, 0, 0, 0)),
+                generate_tuple([&](auto) { return make_multi_index(block_m_id, 0, block_n_id, 0); },
+                               Number<NumDTensor>{}));
+
+            // blockwise copy which loads C from LDS, D from global, applies elementwise
+            // operation and stores result E to global
+            auto cde_shuffle_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3<
+                ThisThreadBlock, // ThreadGroup
+                decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
+                Tuple<EDataType>,
+                decltype(c_ds_desc_refs),
+                decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
+                CDEElementwiseOperation,                                    // ElementwiseOperation,
+                Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // DstInMemOps,
                 Sequence<1,
                          CShuffleMRepeatPerShuffle * MWave * MPerWmma,
                          1,
                          CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths,
-                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
-                CShuffleDataType,     // typename SrcData,
-                CDataType,            // typename DstData,
-                decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat),
-                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
-                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
-                3,                                              // index_t VectorDim,
-                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
-                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
-                false> // bool ThreadTransferDstResetCoordinateAfterRun>
-                {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
-                 make_multi_index(0, 0, 0, 0),
-                 c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_m_id, 0, block_n_id, 0),
-                 c_element_op};
+                CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>,                    // ThreadClusterArrangeOrder,
+                Sequence<0, 1, 2, 3>,                    // SrcDimAccessOrder,
+                Sequence<0, 1, 2, 3>,                    // DstDimAccessOrder,
+                3,                                       // SrcVectorDim,
+                3,                                       // DstVectorDim,
+                CDEShuffleBlockTransferScalarPerVectors, // SrcScalarPerVectors
+                EShuffleBlockTransferScalarPerVector,    // DstScalarPerVector
+                sequence_merge_t<
+                    Sequence<true>,
+                    uniform_sequence_gen_t<NumDTensor,
+                                           false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
+                Sequence<false>>                    // ThreadTransferDstResetCoordinateAfterRunFlags
+                {c_ds_desc_refs,
+                 idx_c_ds_block_begin,
+                 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                 make_tuple(make_multi_index(block_m_id, 0, block_n_id, 0)),
+                 cde_element_op};
 
             // space filling curve for local reg & global memory
             // space filling curve for threadwise C in VGPR
@@ -1370,7 +1642,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                                            MAccVgprs>>{};
 
             // space filling curve for shuffled blockwise C in global mem
-            constexpr auto sfc_c_global =
+            constexpr auto sfc_cde_global =
                 SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
                                   Sequence<0, 2, 1, 3>,
                                   Sequence<1,
@@ -1380,7 +1652,7 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
 
             constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
 
-            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+            static_assert(num_access == sfc_cde_global.GetNumOfAccess(), "wrong!");
 
             static_for<0, num_access, 1>{}([&](auto access_id) {
                 // make sure it's safe to write to LDS
@@ -1397,20 +1669,26 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                 // make sure it's safe to read from LDS
                 block_sync_lds();
 
-                // each block copy its data from LDS to global
-                c_shuffle_block_copy_lds_to_global.Run(
-                    c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
-                    c_shuffle_block_buf,
-                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                    c_grid_buf);
+                // each block loads its C data from LDS, D from global, applies elementwise
+                // operation and stores result E to global
+                cde_shuffle_block_copy_lds_and_global.Run(
+                    c_ds_desc_refs,
+                    c_ds_buf_refs,
+                    tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                    tie(e_grid_buf));
 
                 if constexpr(access_id < num_access - 1)
                 {
-                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+                    constexpr auto cde_global_step = sfc_cde_global.GetForwardStep(access_id);
+                    // move on Ds
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        cde_shuffle_block_copy_lds_and_global.MoveSrcSliceWindow(
+                            c_ds_desc_refs, i + I1, cde_global_step);
+                    });
 
-                    // move on C
-                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
-                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                    // move on E
+                    cde_shuffle_block_copy_lds_and_global.MoveDstSliceWindow(
+                        tie(e_grid_desc_mblock_mperblock_nblock_nperblock), cde_global_step);
                 }
             });
         }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
index 68112489ca..3940c42c20 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
@@ -269,6 +269,9 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -325,7 +328,7 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
@@ -406,6 +409,9 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN)
         {
@@ -459,7 +465,7 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
@@ -553,6 +559,8 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3
         return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     using BlockwiseGemmPipe =
         remove_cvref_t<decltype(BlockGemmPipeline_Selector<
                                 BlkGemmPipelineVer,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
index 129929b665..3e91f120b1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -37,11 +37,14 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg, karg.p_workspace_);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg, karg.p_workspace_);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -58,20 +61,23 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
     kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.p_workspace_);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid,
+            karg.p_b_grid,
+            karg.p_c_grid,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.p_workspace_);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -656,6 +662,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -712,7 +721,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
@@ -793,6 +802,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN)
         {
@@ -846,7 +858,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
@@ -1006,6 +1018,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index e4d5b99ffe..4cd1a587e9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -24,11 +24,15 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
     kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -48,10 +52,15 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                 FloatC* __restrict__ p_c_grid,
                                 typename GridwiseGemm::Problem problem)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -542,6 +551,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
index 57624b218c..ccba4d4a94 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -25,14 +25,18 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, 1)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v2(typename GridwiseGemm::Argument karg)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, TailNum>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
+        GridwiseGemm::template Run<HasMainKBlockLoop, TailNum>(
+            karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -52,12 +56,16 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, 1)
                                 FloatC* p_c_grid,
                                 typename GridwiseGemm::Problem problem)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        p_a_grid, p_b_grid, p_c_grid, p_shared_0, p_shared_1, problem);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid, p_b_grid, p_c_grid, p_shared_0, p_shared_1, problem);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -581,6 +589,22 @@ struct GridwiseGemm_xdl_cshuffle_v2
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index 4c07d60b0f..a6e4870ac7 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -818,10 +818,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = (NXdlPerWave * NPerXdl == 0) ? 0 : NPerBlock / (NXdlPerWave * NPerXdl);
-        constexpr index_t WaveSize = (MWaves * NWaves == 0) ? 64 : BlockSize / (MWaves * NWaves);
-
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);   
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -960,9 +959,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = (NXdlPerWave * NPerXdl == 0) ? 0 : NPerBlock / (NXdlPerWave * NPerXdl);
-        constexpr index_t WaveSize = (MWaves * NWaves == 0) ? 64 : BlockSize / (MWaves * NWaves);
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -1188,74 +1187,23 @@ struct GridwiseGemm_xdl_cshuffle_v3
             return false;
         }
 
-        // Check tile size
-#if defined(__gfx11__) || defined(__gfx12__)
-        if constexpr(MPerXdl != 16 || NPerXdl != 16)
-        {
-            return false;
-        }
-#endif
-        // Check atomic caps
-#if defined(__gfx11__)
-        constexpr bool SupportMemOp = CGlobalMemoryDataOperation == InMemoryDataOperationEnum::Set;
-#else
-        constexpr bool SupportMemOp = sizeof(CDataType) >= 2 || (CGlobalMemoryDataOperation ==
-                                                                 InMemoryDataOperationEnum::Set);
-#endif
-        if constexpr(SupportMemOp == false)
-        {
-            return false;
-        }
-
-        // Check tile size
-        if constexpr(MXdlPerWave > 0 && NXdlPerWave > 0)
-        {
-            constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-            constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-            if constexpr(MWaves > 0 && NWaves > 0)
-            {
-                constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
-                if constexpr(WaveSize == get_warp_size())
-                {
-                    return true;
-                }
-                else
-                {
-                    return false;
-                }
-            }
-            else
-            {
-                return false;
-            }
-        }
-        else
-        {
-            return false;
-        }
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+                   BlockSize,
+                   MPerBlock,
+                   NPerBlock,
+                   MPerXdl,
+                   NPerXdl,
+                   MXdlPerWave,
+                   NXdlPerWave,
+                   CDataType,
+                   CGlobalMemoryDataOperation>();
     }
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
-        if constexpr((MPerXdl * MXdlPerWave) == 0 || (NXdlPerWave * NPerXdl) == 0)
-        {
-            return false;
-        }
-        else
-        {
-            if constexpr((MPerBlock % (MPerXdl * MXdlPerWave) != 0) ||
-                         (NPerBlock % (NXdlPerWave * NPerXdl) != 0))
-            {
-                return false;
-            }
-            else
-            {
-                if(BlockwiseGemmPipe::WaveSize != get_warp_size())
-                {
-                    return false;
-                }
-            }
-        }
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
 
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
index 7947d2490a..78546c4f99 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -35,17 +35,20 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_b_preshuffle(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared,
-        karg);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -63,21 +66,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -347,7 +353,9 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -567,9 +575,7 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -598,9 +604,6 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // For B pre-shuffle only
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -700,6 +703,8 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -735,7 +740,7 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
@@ -886,6 +891,8 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1460,10 +1467,12 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
                                void* p_shared,
                                const Problem& problem)
     {
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
@@ -1842,10 +1851,12 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
                                     void* p_shared_1,
                                     const Problem& problem)
     {
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
index a7d7546b1c..36141bc96f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -35,19 +35,21 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
-        p_shared,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -65,23 +67,25 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -709,6 +713,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -764,7 +771,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
@@ -845,6 +852,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -896,7 +906,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
@@ -1043,6 +1053,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1444,12 +1456,18 @@ struct GridwiseGemm_xdl_cshuffle_v3
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-
-        auto b_thread_offset_n =
-            get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl;
-        auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * KPerThread;
-
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+#if defined(__gfx11__)
+        auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl +
+                                 (get_thread_local_1d_id() / WaveSize) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % 16) / NPerXdl * KPerThread;
+#else
+        auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl +
+                                 (get_thread_local_1d_id() / WaveSize) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % WaveSize) / NPerXdl * KPerThread;
+#endif
         auto b_scale_thread_copy =
             ThreadwiseTensorSliceTransfer_v2<BScaleType,
                                              BScaleType,
@@ -1916,11 +1934,18 @@ struct GridwiseGemm_xdl_cshuffle_v3
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-
-        auto b_thread_offset_n =
-            get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl;
-        auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * KPerThread;
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+#if defined(__gfx11__)
+        auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl +
+                                 (get_thread_local_1d_id() / WaveSize) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % 16) / NPerXdl * KPerThread;
+#else
+        auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl +
+                                 (get_thread_local_1d_id() / WaveSize) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % WaveSize) / NPerXdl * KPerThread;
+#endif
 
         auto b_scale_thread_copy =
             ThreadwiseTensorSliceTransfer_v2<BScaleType,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
index 1187088bb6..35c8c6c3b4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
@@ -34,19 +34,22 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_as_grid,
-        karg.p_bs_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_as_grid,
+            karg.p_bs_grid,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -64,23 +67,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_as_grid,
-        karg.p_bs_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_as_grid,
+            karg.p_bs_grid,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -769,6 +775,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -825,7 +834,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -906,6 +915,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN)
         {
@@ -959,7 +971,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(LDSTypeB) > 128)
@@ -1106,6 +1118,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
index b72c4d0313..5b19ff8542 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -38,21 +38,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_multi_d(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -70,25 +73,28 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_multi_d_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -689,6 +695,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -747,7 +756,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -828,6 +837,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -883,7 +895,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(LDSTypeB) > 128)
@@ -1030,6 +1042,47 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        constexpr bool valid = ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            CDataType,
+            CGlobalMemoryDataOperation_>();
+        if constexpr(!valid)
+        {
+            return false;
+        }
+
+        using MfmaInst = MfmaSelector<ComputeTypeA,
+                                      MPerXdl,
+                                      NPerXdl,
+                                      ComputeTypeB,
+                                      is_single_rate_mfma,
+                                      is_scale_mfma>;
+
+        constexpr index_t KPerThread =
+            KPerBlock / (MfmaInst::GetKPerXdlops() / MfmaInst::GetK1PerXdlops());
+        if constexpr(KPerThread % KPack != 0)
+        {
+            static_assert(0);
+            return false;
+        }
+
+        if constexpr(NXdlPerWave % CShuffleNXdlPerWavePerShuffle != 0)
+        {
+            return false;
+        }
+        return true;
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1037,6 +1090,11 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
+        if constexpr(NXdlPerWave % CShuffleNXdlPerWavePerShuffle != 0)
+        {
+            return false;
+        }
+
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
index e80a3702fb..8119cace3b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -38,21 +38,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_scale_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid,
+            karg.p_b_grid,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            karg.p_a_scale_grid,
+            karg.p_b_scale_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -443,7 +446,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
     __host__ __device__ static constexpr auto
     MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
     {
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t NWaves =
+            NXdlPerWave * NPerXdl == 0 ? 1 : NPerBlock / (NXdlPerWave * NPerXdl);
 
         return MakeGemmMmaTileDescriptor<NXdlPerWave, NWaves, NPerXdl>(BBlockDesc_BK0_N_BK1{});
     }
@@ -685,6 +689,9 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -720,7 +727,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -801,6 +808,9 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN)
         {
@@ -831,7 +841,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(LDSTypeB) > 128)
@@ -978,6 +988,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1358,10 +1370,11 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+        auto a_thread_offset       = get_thread_local_1d_id() % MPerXdl +
+                               (get_thread_local_1d_id() / WaveSize) / NWaves * MPerXdl;
 
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
index 373d4eb4e4..2e95ec0d52 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
@@ -38,21 +38,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -70,23 +73,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        p_shared1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            p_shared1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -375,7 +381,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPackPerGroup>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPackPerGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -591,9 +599,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -604,7 +610,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                       << ", " << "MP:" << MPadded << ", " << "NP:" << NPadded << ", "
                       << "KRead:" << KRead << ", " << "KP:" << KPadded << ", " << "AK0:" << AK0
                       << ", " << "BK0:" << BK0 << ", " << "MBlock: " << MBlock << ", "
-                      << "NBlock: " << NBlock << "}" << std::endl;
+                      << "NBlock: " << NBlock << " }" << std::endl;
         }
 
         index_t M;
@@ -623,9 +629,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -714,6 +717,8 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -749,7 +754,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -896,6 +901,46 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        constexpr bool valid = ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            CDataType,
+            CGlobalMemoryDataOperation_>();
+        if constexpr(!valid)
+        {
+            return false;
+        }
+
+        using MfmaInst = MfmaSelector<ComputeTypeA,
+                                      MPerXdl,
+                                      NPerXdl,
+                                      ComputeTypeB,
+                                      is_single_rate_mfma,
+                                      is_scale_mfma>;
+
+        constexpr index_t KPerThread =
+            KPerBlock / (MfmaInst::GetKPerXdlops() / MfmaInst::GetK1PerXdlops());
+        if constexpr(KPerThread % KPack != 0)
+        {
+            return false;
+        }
+
+        if constexpr(NXdlPerWave % CShuffleNXdlPerWavePerShuffle != 0)
+        {
+            return false;
+        }
+        return true;
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -903,6 +948,11 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
+        if constexpr(NXdlPerWave % CShuffleNXdlPerWavePerShuffle != 0)
+        {
+            return false;
+        }
+
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
@@ -1144,12 +1194,15 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                                CElementwiseOperation c_element_op,
                                const Block2CTileMap& block_2_ctile_map)
     {
-        ignore                           = b_element_op;
+        ignore              = b_element_op;
+        index_t BN0Shuffled = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled = CalculateBK0Shuffled(problem.K);
+
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
 
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
@@ -1578,11 +1631,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                                     const Block2CTileMap& block_2_ctile_map)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
 
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
index e345bc860b..bf7ae1c6e8 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
@@ -39,21 +39,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle(
         typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_scale_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid,
+            karg.p_b_grid,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            karg.p_a_scale_grid,
+            karg.p_b_scale_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -72,23 +75,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds(
         typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_scale_grid,
-        p_shared,
-        p_shared1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid,
+            karg.p_b_grid,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            karg.p_a_scale_grid,
+            karg.p_b_scale_grid,
+            p_shared,
+            p_shared1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -370,7 +376,9 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack / KGroup>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack / KGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -547,9 +555,7 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -579,9 +585,6 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -676,6 +679,8 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -711,7 +716,7 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -861,6 +866,8 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1085,11 +1092,13 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
                                CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
 
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
@@ -1227,10 +1236,12 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+
+        auto a_thread_offset = get_thread_local_1d_id() % MPerXdl +
+                               (get_thread_local_1d_id() / WaveSize) / NWaves * MPerXdl;
 
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
@@ -1580,10 +1591,12 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
                                     CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
@@ -1727,10 +1740,11 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+        auto a_thread_offset       = get_thread_local_1d_id() % MPerXdl +
+                               (get_thread_local_1d_id() / WaveSize) / NWaves * MPerXdl;
 
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
index bc87559c43..1a356c372d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
@@ -39,20 +39,22 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_mx(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx950__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -71,24 +73,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_mx(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx950__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -807,6 +811,10 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -838,9 +846,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
             // kfold and mpair dimension is not always required.
             // more dimension in merge_transform increase the difficulty of generating immarg offset
             // for compiler.
-            constexpr auto WaveSize = 64;
-            constexpr auto M0       = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-            constexpr auto M1       = MPerBlock / M0;
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
@@ -925,6 +932,9 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -952,9 +962,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
         }
         else // RowMajor B
         {
-            constexpr auto WaveSize = 64;
-            constexpr auto N0       = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
-            constexpr auto N1       = NPerBlock / N0;
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
@@ -1108,6 +1117,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
index 7902a16fb3..3d2ef9b6c4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
@@ -39,20 +39,22 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_mx(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx950__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -71,24 +73,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_mx(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
-    // Pass two lds pointer is the key to tell compiler that ds_read/write
-    // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx950__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        // Pass two lds pointer is the key to tell compiler that ds_read/write
+        // operate on different lds chunk at same time without order dependecy
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -422,7 +426,9 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack>{};
         return make_naive_tensor_descriptor_packed(
             make_tuple(N0 / NWave / NXdlPack, NWave, NXdlPack, K0, NkSwizzleNumber));
     }
@@ -666,9 +672,7 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -700,9 +704,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -836,6 +837,9 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -867,9 +871,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
             // kfold and mpair dimension is not always required.
             // more dimension in merge_transform increase the difficulty of generating immarg offset
             // for compiler.
-            constexpr auto WaveSize = 64;
-            constexpr auto M0       = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-            constexpr auto M1       = MPerBlock / M0;
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
@@ -1028,6 +1031,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -2271,10 +2276,12 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
                                     void* p_shared_1,
                                     const Problem& problem)
     {
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
         const auto b_grid_desc_bk0_n_bk1 =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
 
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index e90239b70a..e4152e0427 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -57,28 +57,31 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    // TODO ANT: separate into MMA + Epilogue
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_c0_bias_grid,
-                                                  p_c0_add_grid,
-                                                  p_c0_gamma_grid,
-                                                  p_c0_beta_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  acc_element_op,
-                                                  c_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  c0_grid_desc_nblock_nperblock,
-                                                  block_2_ctile_map);
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
+        // TODO ANT: separate into MMA + Epilogue
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_c_grid,
+                                                      p_c0_bias_grid,
+                                                      p_c0_add_grid,
+                                                      p_c0_gamma_grid,
+                                                      p_c0_beta_grid,
+                                                      p_shared,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      acc_element_op,
+                                                      c_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      c0_grid_desc_nblock_nperblock,
+                                                      block_2_ctile_map);
+    }
     // TODO ANT: Run layernorm epilogue here
 #else
     ignore = p_a_grid;
@@ -243,6 +246,22 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                              c_lds_workspace_size * sizeof(FloatReduceAcc));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
index 50363d832e..67f18de12f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -77,6 +77,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
     static constexpr auto BK1         = Number<BK1Value>{};
     static constexpr auto AK0PerBlock = Number<KPerBlock / AK1Value>{};
     static constexpr auto BK0PerBlock = Number<KPerBlock / BK1Value>{};
+    static constexpr auto BlockSize   = math::max(TileLoadThreadGroupSize, TileMathThreadGroupSize);
 
     struct TileLoadThreadGroup
     {
@@ -171,6 +172,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
                          c_block_size * sizeof(EDataTypeShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            EDataType,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2ETileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index 344c7d6528..abb8c52e0f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -166,20 +166,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                   const CElementwiseOperation c_element_op,
                                   const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_shared,
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_c_grid,
+                                                      p_shared,
+                                                      a_b_k0_m_k1_grid_desc,
+                                                      b_b_k0_n_k1_grid_desc,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      c_block_cluster_adaptor);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -209,8 +213,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MRepeat,
           index_t NRepeat,
@@ -519,6 +523,21 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                          c_block_size * sizeof(FloatC));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MRepeat,
+            NRepeat,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -530,8 +549,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
 
-        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
-                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
+        static_assert((MPerBlock % (MPerXdl * MRepeat) == 0) &&
+                          (NPerBlock % (NRepeat * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
         const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
@@ -604,14 +623,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
     __host__ __device__ static constexpr auto
     GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
     {
-        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
         return make_naive_tensor_descriptor_packed(
             make_tuple(I1,
-                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXdl>{},
                        I1,
-                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXdl>{}));
     }
 
     using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
@@ -751,8 +770,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
         constexpr auto is_scale_mfma = false;
         constexpr index_t KPack      = math::max(K1,
                                             MfmaSelector<FloatAAdjusted,
-                                                              MPerXDL,
-                                                              NPerXDL,
+                                                              MPerXdl,
+                                                              NPerXdl,
                                                               FloatBAdjusted,
                                                               is_single_rate_mfma,
                                                               is_scale_mfma>::selected_mfma.k_per_blk);
@@ -764,8 +783,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                                 FloatAcc,
                                                                 decltype(a_k0_m_k1_block_desc),
                                                                 decltype(b_k0_n_k1_block_desc),
-                                                                MPerXDL,
-                                                                NPerXDL,
+                                                                MPerXdl,
+                                                                NPerXdl,
                                                                 MRepeat,
                                                                 NRepeat,
                                                                 KPack>{};
@@ -807,8 +826,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 
         // output: register to global memory
         {
-            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
             constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc =
                 blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
@@ -834,8 +853,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 
             static_assert(M1 == MWave, "");
             static_assert(N1 == NWave, "");
-            static_assert(M2 * M3 * M4 == MPerXDL, "");
-            static_assert(N2 == NPerXDL, "");
+            static_assert(M2 * M3 * M4 == MPerXdl, "");
+            static_assert(N2 == NPerXdl, "");
 
             constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
                 c_block_desc_mblock_mperblock_nblock_nperblock,
@@ -845,11 +864,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                       M1,
                                                       M2,
                                                       M3,
-                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                     make_freeze_transform(I0),              // freeze nblock
                     make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle,
                                                       N1,
-                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                 make_tuple(
                     Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
@@ -920,9 +939,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                 CElementwiseOperation,      // ElementwiseOperation,
                 CGlobalMemoryDataOperation, // DstInMemOp,
                 Sequence<1,
-                         CShuffleMRepeatPerShuffle * MWave * MPerXDL,
+                         CShuffleMRepeatPerShuffle * MWave * MPerXdl,
                          1,
-                         CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
+                         CShuffleNRepeatPerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
                 CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                 Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                 FloatC,               // typename SrcData,
@@ -941,11 +960,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
-                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0);
+                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXdl, 0, 0);
             constexpr auto nxdlperwave_forward_step =
-                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXdl);
             constexpr auto nxdlperwave_backward_step =
-                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXdl);
 
             static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) {
                 constexpr auto mxdlperwave = mxdlperwave_iter;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
index 24fe81c74e..cf3040d1ae 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,8 +21,7 @@ template <typename GridwiseGemm,
           typename FloatC,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
-          typename BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3,
-          typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
+          typename CGridDesc_M_N,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
@@ -32,39 +31,48 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
 __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemm_xdlops_skip_b_lds_v1(
-        const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-        const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
-        const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CElementwiseOperation c_element_op,
-        const Block2CTileMap block_2_ctile_map)
+    kernel_gemm_xdlops_skip_b_lds_v1(const FloatAB* __restrict__ p_a_grid,
+                                     const FloatAB* __restrict__ p_b_grid,
+                                     FloatC* __restrict__ p_c_grid,
+                                     const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
+                                     const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
+                                     const CGridDesc_M_N c_grid_desc_m_n,
+                                     const AElementwiseOperation a_element_op,
+                                     const BElementwiseOperation b_element_op,
+                                     const CElementwiseOperation c_element_op,
+                                     const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
-                                                   p_b_grid,
-                                                   p_c_grid,
-                                                   p_shared,
-                                                   a_grid_desc_k0_m_k1,
-                                                   b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
-                                                   c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                   a_element_op,
-                                                   b_element_op,
-                                                   c_element_op,
-                                                   block_2_ctile_map);
+        auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+            GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
+
+        auto b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3 =
+            GridwiseGemm::MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(b_grid_desc_k0_n_k1);
+
+        GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
+                                                       p_b_grid,
+                                                       p_c_grid,
+                                                       p_shared,
+                                                       a_grid_desc_k0_m_k1,
+                                                       b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
+                                                       c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                                       a_element_op,
+                                                       b_element_op,
+                                                       c_element_op,
+                                                       block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
     ignore = p_c_grid;
     ignore = a_grid_desc_k0_m_k1;
-    ignore = b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3;
-    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
+    ignore = b_grid_desc_k0_n_k1;
+    ignore = c_grid_desc_m_n;
     ignore = a_element_op;
     ignore = b_element_op;
     ignore = c_element_op;
@@ -86,8 +94,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MXdlPerWave,
           index_t NXdlPerWave,
@@ -119,11 +127,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
     // K1 should be Number<...>
     static constexpr auto K1 = Number<K1Value>{};
 
-    static constexpr index_t WaveSize = 64;
-    static constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXDL);
-    static constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXDL);
+    static constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+    static constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+    static constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
 
-    static constexpr auto xdlops_gemm    = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, K1>{};
+    static constexpr auto xdlops_gemm    = XdlopsGemm<FloatAB, MPerXdl, NPerXdl, K1>{};
     static constexpr index_t K0PerThread = K0PerBlock / xdlops_gemm.K0PerXdlops;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
@@ -164,6 +172,21 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
         return (a_block_space_size_aligned) * sizeof(FloatAB);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ __device__ static constexpr bool
     CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
@@ -175,8 +198,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
 
-        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
-                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
         const auto M  = a_grid_desc_k0_m_k1.GetLength(I1);
@@ -242,7 +265,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
             make_tuple(make_unmerge_transform(
                            make_tuple(K0 / K0PerBlock, xdlops_gemm.K0PerXdlops, K0PerThread)),
                        make_unmerge_transform(make_tuple(
-                           N / (NXdlPerWave * NWaves * NPerXDL), NXdlPerWave, NWaves, NPerXDL)),
+                           N / (NXdlPerWave * NWaves * NPerXdl), NXdlPerWave, NWaves, NPerXdl)),
                        make_pass_through_transform(K1)),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
             make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5, 6>{}, Sequence<7>{}));
@@ -264,7 +287,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
     __device__ static auto GetWaveKNIdx(const index_t thread_id)
     {
         constexpr auto wave_threadid_to_nk_idx_adaptor = make_single_stage_tensor_adaptor(
-            make_tuple(make_merge_transform(make_tuple(xdlops_gemm.K0PerXdlops, NPerXDL))),
+            make_tuple(make_merge_transform(make_tuple(xdlops_gemm.K0PerXdlops, NPerXdl))),
             make_tuple(Sequence<0, 1>{}),
             make_tuple(Sequence<0>{}));
 
@@ -311,8 +334,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
             MPerBlock,
             NPerBlock,
             K0PerBlock,
-            MPerXDL,
-            NPerXDL,
+            MPerXdl,
+            NPerXdl,
             MXdlPerWave,
             NXdlPerWave,
             K1>;
@@ -356,20 +379,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
         return cblockid_to_m0_n0_block_cluster_adaptor;
     }
 
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
-        decltype(MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
     using DefaultBlock2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1));
-    using BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 =
-        decltype(MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(BGridDesc_K0_N_K1{}));
 
-    template <bool HasMainK0BlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
+    template <bool HasMainK0BlockLoop,
+              typename BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3,
+              typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
+              typename Block2CTileMap = DefaultBlock2CTileMap>
     __device__ static void
     Run(const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,
         void* __restrict__ p_shared,
         const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
-        const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
+        const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3& b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
         const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2& c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
         const AElementwiseOperation& a_element_op,
         const BElementwiseOperation& b_element_op,
@@ -510,8 +532,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
             MPerBlock,
             NPerBlock,
             K0PerBlock,
-            MPerXDL,
-            NPerXDL,
+            MPerXdl,
+            NPerXdl,
             MXdlPerWave,
             NXdlPerWave,
             K1>{};
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp
index a13ce732e6..c5f60a7413 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -39,12 +39,15 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                               const CElementwiseOperation c_element_op)
 {
 #if defined(__gfx9__)
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
 
-    __shared__ uint8_t p_shared[shared_size];
+        __shared__ uint8_t p_shared[shared_size];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        karg, static_cast<void*>(p_shared), b2c_map, a_element_op, b_element_op, c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+            karg, static_cast<void*>(p_shared), b2c_map, a_element_op, b_element_op, c_element_op);
+    }
 #else
     ignore = karg;
     ignore = b2c_map;
@@ -70,8 +73,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MRepeat,
           index_t NRepeat,
@@ -394,6 +397,10 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
                          c_block_size * sizeof(FloatC));
     }
 
+    static constexpr index_t MXdlPerWave = MRepeat;
+    static constexpr index_t NXdlPerWave = NRepeat;
+    IS_VALID_COMPILATION_PARAMETER_IMPL(FloatC)
+
     __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
     {
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
@@ -520,14 +527,14 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
     __host__ __device__ static constexpr auto
     GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
     {
-        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
         return make_naive_tensor_descriptor_packed(
             make_tuple(I1,
-                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXdl>{},
                        I1,
-                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXdl>{}));
     }
 
     // return block_id to C matrix tile idx (m0, n0, k_split) mapping
@@ -711,8 +718,8 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
             FloatAcc,
             decltype(a_k0_m_k1_block_desc),
             decltype(b_k0_n_k1_block_desc),
-            MPerXDL,
-            NPerXDL,
+            MPerXdl,
+            NPerXdl,
             MRepeat,
             NRepeat,
             K1,
@@ -766,8 +773,8 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
 
         // output: register to global memory
         {
-            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
             constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc =
                 blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
@@ -799,11 +806,11 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
                                                       M1,
                                                       M2,
                                                       M3,
-                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                     make_freeze_transform(I0),              // freeze nblock
                     make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle,
                                                       N1,
-                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                 make_tuple(
                     Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
@@ -874,9 +881,9 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
                 CElementwiseOperation,      // ElementwiseOperation,
                 CGlobalMemoryDataOperation, // DstInMemOp,
                 Sequence<1,
-                         CShuffleMRepeatPerShuffle * MWave * MPerXDL,
+                         CShuffleMRepeatPerShuffle * MWave * MPerXdl,
                          1,
-                         CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
+                         CShuffleNRepeatPerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
                 CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                 Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                 FloatC,               // typename SrcData,
@@ -895,11 +902,11 @@ struct GridwiseGemm_xdlops_splitk_lds_direct_load
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
-                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0);
+                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXdl, 0, 0);
             constexpr auto nxdlperwave_forward_step =
-                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXdl);
             constexpr auto nxdlperwave_backward_step =
-                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXdl);
 
             static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) {
                 constexpr auto mxdlperwave = mxdlperwave_iter;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
index 6aa61fcd38..a040409a6d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -37,23 +37,26 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                index_t StrideC,
                                typename GridwiseGemm::Block2CTileMap block_mapping)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
 
-    __shared__ uint8_t p_shared[shared_size];
+        __shared__ uint8_t p_shared[shared_size];
 
-    GridwiseGemm::Run(p_a_grid,
-                      p_b_grid,
-                      p_c_grid,
-                      p_workspace,
-                      M,
-                      N,
-                      K,
-                      StrideA,
-                      StrideB,
-                      StrideC,
-                      block_mapping,
-                      static_cast<void*>(p_shared));
+        GridwiseGemm::Run(p_a_grid,
+                          p_b_grid,
+                          p_c_grid,
+                          p_workspace,
+                          M,
+                          N,
+                          K,
+                          StrideA,
+                          StrideB,
+                          StrideC,
+                          block_mapping,
+                          static_cast<void*>(p_shared));
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -83,8 +86,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MRepeat,
           index_t NRepeat,
@@ -305,6 +308,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    static constexpr index_t MXdlPerWave = MRepeat;
+    static constexpr index_t NXdlPerWave = NRepeat;
+    IS_VALID_COMPILATION_PARAMETER_IMPL(FloatC)
+
     __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
     {
         if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
@@ -380,27 +387,27 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
     __host__ __device__ static constexpr auto
     GetCBlockDescriptor_MBlock_MPerShuffle_NBlock_NPerShuffle()
     {
-        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+        constexpr index_t NWave = NRepeat * NPerXdl == 0 ? 1 : NPerBlock / (NRepeat * NPerXdl);
 
         return make_naive_tensor_descriptor_packed(
             make_tuple(I1,
-                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXdl>{},
                        I1,
-                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXdl>{}));
     }
 
     __host__ __device__ static constexpr auto
     GetCBlockDescriptor_MShuffleRepeat_MPerShuffle_NShuffleRepeat_NPerShuffle()
     {
-        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+        constexpr index_t NWave = NRepeat * NPerXdl == 0 ? 1 : NPerBlock / (NRepeat * NPerXdl);
 
         return make_naive_tensor_descriptor_packed(
             make_tuple(Number<MRepeat / CShuffleMRepeatPerShuffle>{},
-                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXdl>{},
                        Number<NRepeat / CShuffleNRepeatPerShuffle>{},
-                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXdl>{}));
     }
 
     __host__ __device__ static constexpr auto GetClusterLengthReduction()
@@ -490,8 +497,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                                                                 FloatAcc,
                                                                 decltype(a_block_desc_k0_m_k1),
                                                                 decltype(b_block_desc_k0_n_k1),
-                                                                MPerXDL,
-                                                                NPerXDL,
+                                                                MPerXdl,
+                                                                NPerXdl,
                                                                 MRepeat,
                                                                 NRepeat,
                                                                 K1>{};
@@ -829,8 +836,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
 
             // output: register to global memory
             {
-                constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-                constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+                constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+                constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
                 constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc =
                     blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
@@ -871,12 +878,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                                               M1,
                                               M2,
                                               M3,
-                                              M4)),       // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                              M4)),       // M1 = MWave, M2 * M3 * M4 = MPerXdl
                                make_freeze_transform(I0), // freeze nblock
                                make_unmerge_transform(
                                    make_tuple(CShuffleNRepeatPerShuffle,
                                               N1,
-                                              N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                              N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<>{},
                                Sequence<0, 2, 4, 5, 6>{},
@@ -948,9 +955,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                     CElementwiseOperation, // ElementwiseOperation,
                                            // InMemoryDataOperationEnum::Set, // DstInMemOp,
                     Sequence<1,
-                             CShuffleMRepeatPerShuffle * MWave * MPerXDL,
+                             CShuffleMRepeatPerShuffle * MWave * MPerXdl,
                              1,
-                             CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
+                             CShuffleNRepeatPerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
                     CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                     Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                     FloatCShuffle,        // typename SrcData,
@@ -977,9 +984,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                     CElementwiseOperation, // ElementwiseOperation,
                                            // InMemoryDataOperationEnum::Set, // DstInMemOp,
                     Sequence<1,
-                             CShuffleMRepeatPerShuffle * MWave * MPerXDL,
+                             CShuffleMRepeatPerShuffle * MWave * MPerXdl,
                              1,
-                             CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
+                             CShuffleNRepeatPerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
                     CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                     Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                     FloatCShuffle,        // typename SrcData,
@@ -1000,11 +1007,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                      c_element_op};
 
                 constexpr auto mxdlperwave_forward_step =
-                    make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0);
+                    make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXdl, 0, 0);
                 constexpr auto nxdlperwave_forward_step =
-                    make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                    make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXdl);
                 constexpr auto nxdlperwave_backward_step =
-                    make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                    make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXdl);
 
                 static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) {
                     constexpr auto mxdlperwave = mxdlperwave_iter;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index ae9a8af813..d2418c0913 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -38,16 +38,20 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
                             const CGridDesc_M_N c_grid_desc_m_n)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_shared,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m_n);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_c_grid,
+                                                      p_shared,
+                                                      a_grid_desc_k0_m_k1,
+                                                      b_grid_desc_k0_n_k1,
+                                                      c_grid_desc_m_n);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -68,25 +72,29 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
     kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    const auto a_grid_desc_k0_m_k1 =
-        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
-            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
-    const auto b_grid_desc_k0_n_k1 =
-        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
-            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
-    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
-        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+        const auto a_grid_desc_k0_m_k1 =
+            amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+                karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+        const auto b_grid_desc_k0_n_k1 =
+            amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+                karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+        const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+            karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
-                                                  karg.p_b_grid,
-                                                  karg.p_c_grid,
-                                                  p_shared,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m_n);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
+                                                      karg.p_b_grid,
+                                                      karg.p_c_grid,
+                                                      p_shared,
+                                                      a_grid_desc_k0_m_k1,
+                                                      b_grid_desc_k0_n_k1,
+                                                      c_grid_desc_m_n);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
@@ -103,8 +111,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MXdlPerWave,
           index_t NXdlPerWave,
@@ -142,10 +150,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr bool is_single_rate_mfma =
+        (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) && K1Value <= 4) ||
+         (is_same<FloatAB, int8_t>::value && K1Value <= 8) ||
+         ((is_same<FloatAB, f8_t>::value || is_same<FloatAB, bf8_t>::value) && K1Value < 32))
+            ? true
+            : false;
+    static constexpr auto is_scale_mfma = false;
+    static constexpr auto K1            = Number<math::max(
+        K1Value,
+        MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma, is_scale_mfma>::
+            selected_mfma.k_per_blk)>{};
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    using ElementDataTypeAB = conditional_t<is_same_v<FloatAB, ck::tf32_t>, float, FloatAB>;
+
     __host__ static auto CalculateGridSize(index_t M, index_t N)
     {
         return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
@@ -218,8 +238,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     // Argument
     struct Argument : public Problem, public tensor_operation::device::BaseArgument
     {
-        __host__ Argument(const FloatAB* p_a_grid_,
-                          const FloatAB* p_b_grid_,
+        __host__ Argument(const ElementDataTypeAB* p_a_grid_,
+                          const ElementDataTypeAB* p_b_grid_,
                           FloatC* p_c_grid_,
                           index_t M_,
                           index_t N_,
@@ -234,8 +254,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         {
         }
 
-        const FloatAB* p_a_grid;
-        const FloatAB* p_b_grid;
+        const ElementDataTypeAB* p_a_grid;
+        const ElementDataTypeAB* p_b_grid;
         FloatC* p_c_grid;
     };
 
@@ -311,7 +331,24 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         constexpr auto b_block_space_size_aligned =
             math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align);
 
-        return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB);
+        return (a_block_space_size_aligned + b_block_space_size_aligned) *
+               sizeof(ElementDataTypeAB);
+    }
+
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
     }
 
     template <typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N>
@@ -323,8 +360,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
 
-        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
-                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
         const auto M  = a_grid_desc_k0_m_k1.GetLength(I1);
@@ -356,8 +393,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
 
-        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
-                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
         // check gridwise gemm pipeline
@@ -416,16 +453,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
         using BlockwiseGemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatABAdjusted,
-                                                                FloatABAdjusted,
+                                                                ElementDataTypeAB,
+                                                                ElementDataTypeAB,
                                                                 FloatAcc,
                                                                 decltype(a_block_desc_k0_m_k1),
                                                                 decltype(b_block_desc_k0_n_k1),
-                                                                MPerXDL,
-                                                                NPerXDL,
+                                                                MPerXdl,
+                                                                NPerXdl,
                                                                 MXdlPerWave,
                                                                 NXdlPerWave,
-                                                                K1>;
+                                                                K1,
+                                                                FloatABAdjusted,
+                                                                FloatABAdjusted>;
 
         return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
     }
@@ -437,8 +476,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
               typename AGridDesc_K0_M_K1,
               typename BGridDesc_K0_N_K1,
               typename CGridDesc_M_N>
-    __device__ static void Run(const FloatAB* p_a_grid,
-                               const FloatAB* p_b_grid,
+    __device__ static void Run(const ElementDataTypeAB* p_a_grid,
+                               const ElementDataTypeAB* p_b_grid,
                                FloatC* p_c_grid,
                                void* __restrict__ p_shared,
                                const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
@@ -499,8 +538,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                 Sequence<K0PerBlock, MPerBlock, K1>,
                                                 ABlockTransferThreadClusterLengths_K0_M_K1,
                                                 ABlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
-                                                FloatABAdjusted,
+                                                ElementDataTypeAB,
+                                                ElementDataTypeAB,
                                                 decltype(a_grid_desc_k0_m_k1),
                                                 decltype(a_block_desc_k0_m_k1),
                                                 ABlockTransferSrcAccessOrder,
@@ -530,8 +569,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                 Sequence<K0PerBlock, NPerBlock, K1>,
                                                 BBlockTransferThreadClusterLengths_K0_N_K1,
                                                 BBlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
-                                                FloatABAdjusted,
+                                                ElementDataTypeAB,
+                                                ElementDataTypeAB,
                                                 decltype(b_grid_desc_k0_n_k1),
                                                 decltype(b_block_desc_k0_n_k1),
                                                 BBlockTransferSrcAccessOrder,
@@ -561,17 +600,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         // sanity check
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
-            FloatABAdjusted,
-            FloatABAdjusted,
+            ElementDataTypeAB,
+            ElementDataTypeAB,
             FloatAcc,
             decltype(a_block_desc_k0_m_k1),
             decltype(b_block_desc_k0_n_k1),
-            MPerXDL,
-            NPerXDL,
+            MPerXdl,
+            NPerXdl,
             MXdlPerWave,
             NXdlPerWave,
             K1,
-            LoopSched>();
+            LoopSched,
+            FloatABAdjusted,
+            FloatABAdjusted>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -580,10 +621,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
 
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
+            static_cast<ElementDataTypeAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ElementDataTypeAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_k0_n_k1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
@@ -704,8 +745,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MXdlPerWave,
           index_t NXdlPerWave,
@@ -743,8 +784,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
                                               MPerBlock,
                                               NPerBlock,
                                               K0PerBlock,
-                                              MPerXDL,
-                                              NPerXDL,
+                                              MPerXdl,
+                                              NPerXdl,
                                               K1Value,
                                               MXdlPerWave,
                                               NXdlPerWave,
@@ -783,8 +824,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
                                                 MPerBlock,
                                                 NPerBlock,
                                                 K0PerBlock,
-                                                MPerXDL,
-                                                NPerXDL,
+                                                MPerXdl,
+                                                NPerXdl,
                                                 K1Value,
                                                 MXdlPerWave,
                                                 NXdlPerWave,
@@ -957,13 +998,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
         }
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(FloatC)
+
     __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
 
-        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
-                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
index f779e63752..a9a463e2c1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -42,23 +42,27 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                             const CElementwiseOperation c_element_op,
                             const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+#ifdefined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        constexpr index_t shared_block_size =
+            GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
 
-    __shared__ FloatAB p_shared_block[shared_block_size];
+        __shared__ FloatAB p_shared_block[shared_block_size];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_shared_block,
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                      p_b_grid,
+                                                      p_c_grid,
+                                                      p_shared_block,
+                                                      a_b_k0_m_k1_grid_desc,
+                                                      b_b_k0_n_k1_grid_desc,
+                                                      c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      c_block_cluster_adaptor);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -171,6 +175,22 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
         return (a_block_space_size + b_block_space_size) * sizeof(FloatAB);
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 595a597318..d8f22b682d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -36,13 +36,16 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                                          const BElementwiseOperation b_element_op,
                                          const CElementwiseOperation c_element_op)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
 
-    __shared__ uint8_t p_shared[shared_size];
+        __shared__ uint8_t p_shared[shared_size];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        karg, static_cast<void*>(p_shared), b2c_map, a_element_op, b_element_op, c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+            karg, static_cast<void*>(p_shared), b2c_map, a_element_op, b_element_op, c_element_op);
+    }
 #else
     ignore = karg;
     ignore = b2c_map;
@@ -68,8 +71,8 @@ template <index_t BlockSize,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
+          index_t MPerXdl,
+          index_t NPerXdl,
           index_t K1Value,
           index_t MRepeat,
           index_t NRepeat,
@@ -430,6 +433,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                          c_block_size * sizeof(FloatC));
     }
 
+    static constexpr auto MXdlPerWave = MRepeat;
+    static constexpr auto NXdlPerWave = NRepeat;
+    IS_VALID_COMPILATION_PARAMETER_IMPL(FloatC)
+
     __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
     {
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
@@ -635,14 +642,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     __host__ __device__ static constexpr auto
     GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
     {
-        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+        constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
         return make_naive_tensor_descriptor_packed(
             make_tuple(I1,
-                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXDL>{},
+                       Number<CShuffleMRepeatPerShuffle * MWave * MPerXdl>{},
                        I1,
-                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
+                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXdl>{}));
     }
 
     // return block_id to C matrix tile idx (m0, n0, k_split) mapping
@@ -848,8 +855,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             FloatAcc,
             decltype(a_k0_m_k1_block_desc),
             decltype(b_k0_n_k1_block_desc),
-            MPerXDL,
-            NPerXDL,
+            MPerXdl,
+            NPerXdl,
             MRepeat,
             NRepeat,
             K1,
@@ -899,8 +906,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
         // output: register to global memory
         {
-            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL);
-            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL);
+            constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
 
             constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc =
                 blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
@@ -932,11 +939,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                                       M1,
                                                       M2,
                                                       M3,
-                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                     make_freeze_transform(I0),              // freeze nblock
                     make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle,
                                                       N1,
-                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL
+                                                      N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                 make_tuple(
                     Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
@@ -1007,9 +1014,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 CElementwiseOperation,      // ElementwiseOperation,
                 CGlobalMemoryDataOperation, // DstInMemOp,
                 Sequence<1,
-                         CShuffleMRepeatPerShuffle * MWave * MPerXDL,
+                         CShuffleMRepeatPerShuffle * MWave * MPerXdl,
                          1,
-                         CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths,
+                         CShuffleNRepeatPerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
                 CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
                 Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
                 FloatC,               // typename SrcData,
@@ -1028,11 +1035,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
-                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0);
+                make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXdl, 0, 0);
             constexpr auto nxdlperwave_forward_step =
-                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXdl);
             constexpr auto nxdlperwave_backward_step =
-                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL);
+                make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXdl);
 
             static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) {
                 constexpr auto mxdlperwave = mxdlperwave_iter;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index 8822778b52..7d5a8da60f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -46,21 +46,25 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const CElementwiseOperation c_element_op,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainK0BlockLoop>(
-        p_a_grid,
-        p_b_grid,
-        p_c_grid,
-        p_shared,
-        a_grid_desc_ak0_m_ak1,
-        b_grid_desc_bk0_n_bk1,
-        c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        a_element_op,
-        b_element_op,
-        c_element_op,
-        block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainK0BlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            p_shared,
+            a_grid_desc_ak0_m_ak1,
+            b_grid_desc_bk0_n_bk1,
+            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -230,6 +234,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                          c_block_size * sizeof(FloatCShuffle));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
index c3bbece33c..7c559d1f85 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -49,23 +49,27 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const CElementwiseOperation c_element_op,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        p_a_grid,
-        p_b_grid,
-        p_c_grid,
-        p_c0_grid,
-        p_shared,
-        a_grid_desc_k0_m_k1,
-        b_grid_desc_k0_n_k1,
-        c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        a_element_op,
-        b_element_op,
-        c_element_op,
-        block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            p_c0_grid,
+            p_shared,
+            a_grid_desc_k0_m_k1,
+            b_grid_desc_k0_n_k1,
+            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -235,6 +239,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
                          c_block_size * sizeof(FloatC));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -298,7 +318,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
         const auto NBlock = N / NPerBlock;
 
         constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t NWave =
+            NXdlPerWave * NPerXdl == 0 ? 1 : NPerBlock / (NXdlPerWave * NPerXdl);
 
         const auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
             transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
index 2e288efee2..83f8773a08 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -53,25 +53,29 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
         const CElementwiseOperation c_element_op,
         const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        p_a_grid,
-        p_b_grid,
-        p_c_grid,
-        p_c0_grid,
-        p_c1_grid,
-        p_shared,
-        a_grid_desc_k0_m_k1,
-        b_grid_desc_k0_n_k1,
-        c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
-        a_element_op,
-        b_element_op,
-        c_element_op,
-        block_2_ctile_map);
+        GridwiseGemm::template Run<HasMainKBlockLoop>(
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            p_c0_grid,
+            p_c1_grid,
+            p_shared,
+            a_grid_desc_k0_m_k1,
+            b_grid_desc_k0_n_k1,
+            c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
+            a_element_op,
+            b_element_op,
+            c_element_op,
+            block_2_ctile_map);
+    }
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -244,6 +248,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
                          c_block_size * sizeof(FloatC));
     }
 
+    template <
+        InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        return ck::tensor_operation::device::IsValidGemmCompilationParameter<
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            FloatC,
+            CGlobalMemoryDataOperation>();
+    }
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     template <typename Block2CTileMap>
     __host__ __device__ static constexpr bool
@@ -307,7 +327,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
         const auto NBlock = N / NPerBlock;
 
         constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t NWave =
+            NXdlPerWave * NPerXdl == 0 ? 1 : NPerBlock / (NXdlPerWave * NPerXdl);
 
         const auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl =
             transform_tensor_descriptor(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index 48ccb49db4..b9c0d671db 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -45,24 +45,27 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_moe_gemm(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -80,26 +83,29 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_moe_gemm_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        p_shared1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            p_shared1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -405,7 +411,9 @@ struct GridwiseMoeGemm
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack / KGroup>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack / KGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -611,9 +619,7 @@ struct GridwiseMoeGemm
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -646,9 +652,6 @@ struct GridwiseMoeGemm
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -757,6 +760,9 @@ struct GridwiseMoeGemm
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -792,7 +798,7 @@ struct GridwiseMoeGemm
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -940,6 +946,8 @@ struct GridwiseMoeGemm
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1164,6 +1172,8 @@ struct GridwiseMoeGemm
                                CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1172,7 +1182,7 @@ struct GridwiseMoeGemm
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
@@ -1418,7 +1428,7 @@ struct GridwiseMoeGemm
             const float* p_scale_b          = p_ds_grid[I1];
 
             static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock);
-            static_assert(M4 == 4);
+            static_assert(M4 == 4 || M4 == 8);
             const index_t m1 = get_warp_local_1d_id() / NWave;
             const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl;
 
@@ -1435,8 +1445,8 @@ struct GridwiseMoeGemm
                     p_scale_b += expert_id;
                 }
 
-                vector_type<int32_t, 4> scale_token_ids;
-                vector_type<float, 4> topk_weights;
+                vector_type<int32_t, M4> scale_token_ids;
+                vector_type<float, M4> topk_weights;
                 static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
                     const float scale_b = p_scale_b[n0 * NWave * NPerXdl * PerTokenQuant];
                     static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
@@ -1458,7 +1468,8 @@ struct GridwiseMoeGemm
                                 float scale_a = [&]() {
                                     if constexpr(PerTokenQuant)
                                     {
-                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
+                                        index_t fused_token =
+                                            scale_token_ids.template AsType<index_t>()[m4];
                                         const index_t token_offset = fused_token & 0xffffff;
                                         return token_offset < problem.NumTokens
                                                    ? p_sorted_weights_0[IsInputGemm
@@ -1489,8 +1500,8 @@ struct GridwiseMoeGemm
                                         float up   = scale_a * scale_up * c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
                                         {
@@ -1509,8 +1520,8 @@ struct GridwiseMoeGemm
                                         float up   = scale_a * scale_up * c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
                                         {
@@ -1527,8 +1538,9 @@ struct GridwiseMoeGemm
                                         scale_a * scale_b * c_thread_buf[cidx];
                                     if constexpr(MulRoutedWeight)
                                     {
-                                        c_thread_buf_fp32(cidx) = c_thread_buf_fp32(cidx) *
-                                                                  topk_weights.AsType<float>()[m4];
+                                        c_thread_buf_fp32(cidx) =
+                                            c_thread_buf_fp32(cidx) *
+                                            topk_weights.template AsType<float>()[m4];
                                     }
                                 }
                             });
@@ -1538,7 +1550,7 @@ struct GridwiseMoeGemm
             }
             else
             {
-                vector_type<float, 4> topk_weights; // for gemm2 only
+                vector_type<float, M4> topk_weights; // for gemm2 only
                 static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
                     static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
                         static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
@@ -1563,8 +1575,8 @@ struct GridwiseMoeGemm
                                         float up   = c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         tensor_operation::element_wise::Silu{}(gate, gate);
                                         c_thread_buf_fp32(cidx) = gate * up;
@@ -1575,8 +1587,8 @@ struct GridwiseMoeGemm
                                         float up   = c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         tensor_operation::element_wise::Gelu{}(gate, gate);
                                         c_thread_buf_fp32(cidx) = gate * up;
@@ -1587,8 +1599,9 @@ struct GridwiseMoeGemm
                                     c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
                                     if constexpr(MulRoutedWeight)
                                     {
-                                        c_thread_buf_fp32(cidx) = topk_weights.AsType<float>()[m4] *
-                                                                  c_thread_buf_fp32[cidx];
+                                        c_thread_buf_fp32(cidx) =
+                                            topk_weights.template AsType<float>()[m4] *
+                                            c_thread_buf_fp32[cidx];
                                     }
                                 }
                             });
@@ -1874,6 +1887,8 @@ struct GridwiseMoeGemm
                                     CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1882,7 +1897,7 @@ struct GridwiseMoeGemm
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
@@ -2136,7 +2151,7 @@ struct GridwiseMoeGemm
             const float* p_scale_b          = p_ds_grid[I1];
 
             static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock);
-            static_assert(M4 == 4);
+            static_assert(M4 == 4 || M4 == 8);
             const index_t m1 = get_warp_local_1d_id() / NWave;
             const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl;
 
@@ -2153,8 +2168,8 @@ struct GridwiseMoeGemm
                     p_scale_b += expert_id;
                 }
 
-                vector_type<int32_t, 4> scale_token_ids;
-                vector_type<float, 4> topk_weights;
+                vector_type<int32_t, M4> scale_token_ids;
+                vector_type<float, M4> topk_weights;
                 static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
                     const float scale_b = p_scale_b[n0 * NWave * NPerXdl * PerTokenQuant];
                     static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
@@ -2176,7 +2191,8 @@ struct GridwiseMoeGemm
                                 float scale_a = [&]() {
                                     if constexpr(PerTokenQuant)
                                     {
-                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
+                                        index_t fused_token =
+                                            scale_token_ids.template AsType<index_t>()[m4];
                                         const index_t token_offset = fused_token & 0xffffff;
                                         return token_offset < problem.NumTokens
                                                    ? p_sorted_weights_0[IsInputGemm
@@ -2207,8 +2223,8 @@ struct GridwiseMoeGemm
                                         float up   = scale_a * scale_up * c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
                                         {
@@ -2227,8 +2243,8 @@ struct GridwiseMoeGemm
                                         float up   = scale_a * scale_up * c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
                                         {
@@ -2245,8 +2261,9 @@ struct GridwiseMoeGemm
                                         scale_a * scale_b * c_thread_buf[cidx];
                                     if constexpr(MulRoutedWeight)
                                     {
-                                        c_thread_buf_fp32(cidx) = c_thread_buf_fp32(cidx) *
-                                                                  topk_weights.AsType<float>()[m4];
+                                        c_thread_buf_fp32(cidx) =
+                                            c_thread_buf_fp32(cidx) *
+                                            topk_weights.template AsType<float>()[m4];
                                     }
                                 }
                             });
@@ -2256,7 +2273,7 @@ struct GridwiseMoeGemm
             }
             else
             {
-                vector_type<float, 4> topk_weights; // for gemm2 only
+                vector_type<float, M4> topk_weights; // for gemm2 only
                 static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
                     static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
                         static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
@@ -2281,8 +2298,8 @@ struct GridwiseMoeGemm
                                         float up   = c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         tensor_operation::element_wise::Silu{}(gate, gate);
                                         c_thread_buf_fp32(cidx) = gate * up;
@@ -2293,8 +2310,8 @@ struct GridwiseMoeGemm
                                         float up   = c_thread_buf_up[cidx];
                                         if constexpr(MulRoutedWeight)
                                         {
-                                            gate = gate * topk_weights.AsType<float>()[m4];
-                                            up   = up * topk_weights.AsType<float>()[m4];
+                                            gate = gate * topk_weights.template AsType<float>()[m4];
+                                            up   = up * topk_weights.template AsType<float>()[m4];
                                         }
                                         tensor_operation::element_wise::Gelu{}(gate, gate);
                                         c_thread_buf_fp32(cidx) = gate * up;
@@ -2305,8 +2322,9 @@ struct GridwiseMoeGemm
                                     c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
                                     if constexpr(MulRoutedWeight)
                                     {
-                                        c_thread_buf_fp32(cidx) = topk_weights.AsType<float>()[m4] *
-                                                                  c_thread_buf_fp32[cidx];
+                                        c_thread_buf_fp32(cidx) =
+                                            topk_weights.template AsType<float>()[m4] *
+                                            c_thread_buf_fp32[cidx];
                                     }
                                 }
                             });
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index 0d78957b07..5f5e24fb9f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -45,26 +45,29 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_moe_gemm(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_scale_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            karg.p_a_scale_grid,
+            karg.p_b_scale_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -82,28 +85,31 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_moe_gemm_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_scale_grid,
-        p_shared,
-        p_shared1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            karg.p_a_scale_grid,
+            karg.p_b_scale_grid,
+            p_shared,
+            p_shared1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -410,7 +416,9 @@ struct GridwiseMoeGemmBlockScale
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack / KGroup>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack / KGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -618,9 +626,7 @@ struct GridwiseMoeGemmBlockScale
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -653,9 +659,6 @@ struct GridwiseMoeGemmBlockScale
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -771,6 +774,8 @@ struct GridwiseMoeGemmBlockScale
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM)
         {
@@ -806,7 +811,7 @@ struct GridwiseMoeGemmBlockScale
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
@@ -957,6 +962,8 @@ struct GridwiseMoeGemmBlockScale
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1183,6 +1190,8 @@ struct GridwiseMoeGemmBlockScale
                                CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1191,7 +1200,7 @@ struct GridwiseMoeGemmBlockScale
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
@@ -1374,10 +1383,11 @@ struct GridwiseMoeGemmBlockScale
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+        auto a_thread_offset       = get_thread_local_1d_id() % MPerXdl +
+                               (get_thread_local_1d_id() / WaveSize) / NWaves * MPerXdl;
 
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
@@ -1578,7 +1588,7 @@ struct GridwiseMoeGemmBlockScale
 
             static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
             static_assert(M0 * M1 * M2 == MPerBlock);
-            static_assert(N4 == 4);
+            static_assert(N4 == 4 || N4 == 8);
             const index_t m1 = get_warp_local_1d_id() / NWave;
             const index_t m2 = threadIdx.x % get_warp_size() % M2;
 
@@ -1929,6 +1939,8 @@ struct GridwiseMoeGemmBlockScale
                                     CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1937,7 +1949,7 @@ struct GridwiseMoeGemmBlockScale
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
@@ -2126,10 +2138,11 @@ struct GridwiseMoeGemmBlockScale
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
-        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+        constexpr index_t MWaves   = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves   = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+        auto a_thread_offset       = get_thread_local_1d_id() % MPerXdl +
+                               (get_thread_local_1d_id() / WaveSize) / NWaves * MPerXdl;
 
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
@@ -2321,7 +2334,7 @@ struct GridwiseMoeGemmBlockScale
 
             static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
             static_assert(M0 * M1 * M2 == MPerBlock);
-            static_assert(N4 == 4);
+            static_assert(N4 == 4 || N4 == 8);
             const index_t m1 = get_warp_local_1d_id() / NWave;
             const index_t m2 = threadIdx.x % get_warp_size() % M2;
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
index ac3a887155..9066decc0a 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
@@ -49,6 +49,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
@@ -68,6 +70,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         karg.a_element_op,
         karg.b_element_op,
         karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -87,27 +90,30 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm_2lds(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -850,6 +856,10 @@ struct GridwiseMoeGemmMX
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -881,9 +891,8 @@ struct GridwiseMoeGemmMX
             // kfold and mpair dimension is not always required.
             // more dimension in merge_transform increase the difficulty of generating immarg offset
             // for compiler.
-            constexpr auto WaveSize = 64;
-            constexpr auto M0       = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-            constexpr auto M1       = MPerBlock / M0;
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
@@ -968,6 +977,10 @@ struct GridwiseMoeGemmMX
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -995,9 +1008,8 @@ struct GridwiseMoeGemmMX
         }
         else // RowMajor B
         {
-            constexpr auto WaveSize = 64;
-            constexpr auto N0       = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
-            constexpr auto N1       = NPerBlock / N0;
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
@@ -1161,6 +1173,8 @@ struct GridwiseMoeGemmMX
         }
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
index a8417b2e02..6854e64092 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
@@ -48,25 +48,28 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -86,6 +89,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm_2lds(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
@@ -107,6 +112,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
         karg.a_element_op,
         karg.b_element_op,
         karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -783,6 +789,10 @@ struct GridwiseMoeGemmMXBNS
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -813,9 +823,8 @@ struct GridwiseMoeGemmMXBNS
             // kfold and mpair dimension is not always required.
             // more dimension in merge_transform increase the difficulty of generating immarg offset
             // for compiler.
-            constexpr auto WaveSize = 64;
-            constexpr auto M0       = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-            constexpr auto M1       = MPerBlock / M0;
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
@@ -900,6 +909,10 @@ struct GridwiseMoeGemmMXBNS
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave    = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -926,9 +939,8 @@ struct GridwiseMoeGemmMXBNS
         }
         else // RowMajor B
         {
-            constexpr auto WaveSize = 64;
-            constexpr auto N0       = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
-            constexpr auto N1       = NPerBlock / N0;
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
@@ -1093,6 +1105,8 @@ struct GridwiseMoeGemmMXBNS
         }
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
index 46e9a19ae6..c367079aab 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
@@ -48,25 +48,28 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -85,27 +88,30 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm_2lds(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_sorted_token_ids,
+            karg.p_sorted_expert_ids,
+            karg.p_max_token_id,
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+            karg.p_ds_grid,
+            karg.p_c_grid,
+            p_shared_0,
+            p_shared_1,
+            karg,
+            karg.a_element_op,
+            karg.b_element_op,
+            karg.c_element_op);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -270,11 +276,11 @@ struct GridwiseMoeGemmMX_BPreshuffle
         return math::integer_least_multiple(N, NPerBlock);
     }
 
-    __host__ static auto CalculateBN0Shuffled(index_t N)
+    __host__ __device__ static auto CalculateBN0Shuffled(index_t N)
     {
         return math::integer_divide_ceil(N, NLane);
     }
-    __host__ static auto CalculateBK0Shuffled(index_t K)
+    __host__ __device__ static auto CalculateBK0Shuffled(index_t K)
     {
         return math::integer_divide_ceil(K, KLane * KPack);
     }
@@ -467,7 +473,9 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<WarpSize * KPack>{};
+        constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
+        constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack>{};
         return make_naive_tensor_descriptor_packed(
             make_tuple(N0 / NWave / NXdlPack, NWave, NXdlPack, K0, NkSwizzleNumber));
     }
@@ -700,9 +708,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
               AK0{CalculateAK0Padded(K_, KBatch_)},
               BK0{CalculateBK0Padded(K_, KBatch_)},
               MBlock{CalculateMBlock(M_)},
-              NBlock{CalculateNBlock(N_)},
-              BN0Shuffled{CalculateBN0Shuffled(N_)},
-              BK0Shuffled{CalculateBK0Shuffled(K_)}
+              NBlock{CalculateNBlock(N_)}
         {
         }
 
@@ -738,9 +744,6 @@ struct GridwiseMoeGemmMX_BPreshuffle
         index_t BK0;
         index_t MBlock;
         index_t NBlock;
-        // FOR PRESHUFFLE ONLY
-        index_t BN0Shuffled;
-        index_t BK0Shuffled;
     };
 
     // Argument
@@ -869,6 +872,9 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWave    = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t WaveSize = BlockSize / (MWave * NWave);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -900,9 +906,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
             // kfold and mpair dimension is not always required.
             // more dimension in merge_transform increase the difficulty of generating immarg offset
             // for compiler.
-            constexpr auto WaveSize = 64;
-            constexpr auto M0       = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-            constexpr auto M1       = MPerBlock / M0;
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
@@ -1060,6 +1065,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType)
+
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
@@ -1292,6 +1299,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
                                CElementwiseOperation c_element_op)
     {
         ignore                           = b_element_op;
+        index_t BN0Shuffled = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled = CalculateBK0Shuffled(problem.K);        
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1300,7 +1309,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
@@ -1998,6 +2007,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
     {
         ignore                           = a_element_op;
         ignore                           = b_element_op;
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -2006,7 +2017,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
             problem.StrideA,
             problem.AK0);
         const auto b_grid_desc_bpreshuffled =
-            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+            MakeBGridDescriptor_Preshuffled(BN0Shuffled, BK0Shuffled);
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
             problem.MPadded,
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 2305997f70..5da9722a4b 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1900,6 +1900,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
                         const DstSliceOriginIdx&,
                         DstBuffer& dst_buf) const
     {
+        ElementwiseOperation element_op_{};
         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                       "wrong! Desc need to known at compile-time");
 
@@ -1985,7 +1986,6 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
             });
         });
     }
-    ElementwiseOperation element_op_{};
 };
 
 // Specialized for gfx12
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp
index ea074144b6..5682117f76 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -48,8 +48,6 @@ struct ThreadwiseTensorSliceTransfer_v7r3
 {
     static constexpr auto I0 = Number<0>{};
 
-    static constexpr auto SrcScalarPerVector = SrcScalarPerVectors{}[I0];
-
     static constexpr index_t nDim = SliceLengths::Size();
 
     static constexpr index_t nSrc = SrcDescs::Size();
@@ -67,6 +65,10 @@ struct ThreadwiseTensorSliceTransfer_v7r3
                               Number<Descs::Size()>{});
     }
 
+    static constexpr auto SrcScalarPerVector =
+        reduce_on_sequence(SrcScalarPerVectors{},
+                           math::minimize<index_t>{},
+                           Number<1>{}); // GetMinSrcScalarPerVector(); SrcScalarPerVectors{}[I0];
     using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
     using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));
 
@@ -165,6 +167,9 @@ struct ThreadwiseTensorSliceTransfer_v7r3
 
                 oob_val = oob_val & is_src_valid;
 
+                // TODO: With column-major matrices this step restricts the transferred tensor slice
+                // to just one element, which consequently prevents using atomic operations if the
+                // matrix data type is on 16 bits.
                 if constexpr(SrcScalarPerVectors{}[i] == 1)
                 {
                     auto data_types = SrcDatas{};
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 429df2413f..bca68764f9 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -270,8 +270,8 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
               class FloatA,
               class FloatB,
               class FloatC,
-              bool neg_a = false,
-              bool neg_b = false,
+              bool neg_a = true,
+              bool neg_b = true,
               bool clamp = false>
     __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
     {
@@ -390,8 +390,8 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8_gfx12,
               class FloatA,
               class FloatB,
               class FloatC,
-              bool neg_a = false,
-              bool neg_b = false,
+              bool neg_a = true,
+              bool neg_b = true,
               bool clamp = false>
     __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
     {
@@ -793,6 +793,9 @@ struct WmmaGemm
             "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), "
             "((f8 or bf8, f8 or bf8), float), (int8, int32) or (int4, int32)!");
         static_for<0, KPack / wmma_instr.k_per_wmma, 1>{}([&](auto k) {
+            // Integer wmma operators need extra input flags to indicate if the input is signed or
+            // unsigned. At the moment CK supports only signed integer inputs, so these flags are
+            // hardcoded.
             if constexpr(!TransposeC)
             {
                 wmma_instr.template run<MPerWmma, NPerWmma>(p_a_wave[k], p_b_wave[k], p_c_thread);
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 0125aa086e..ce2d9299f9 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -20,6 +20,7 @@ static constexpr bool is_scale_mfma_data_type()
            is_same_v<U, bf6_t> || is_same_v<U, f4_t>;
 }
 
+#ifndef CK_CODE_GEN_RTC
 /**
  * @brief Define scale data types that have hardware support for MX GEMMs
  */
@@ -28,6 +29,7 @@ static constexpr bool is_scale_mfma_scale_type()
 {
     return is_same_v<T, e8m0_bexp_t>;
 }
+#endif
 
 /**
  * @brief Combination of data types that have hardware support for MX GEMMs
@@ -41,11 +43,11 @@ static constexpr bool scale_mfma_hw_support()
 
 enum struct MfmaInstr
 {
-    mfma_f32_32x32x1xf32 = 0,
-    mfma_f32_16x16x1xf32,
-    mfma_f32_4x4x1xf32,
-    mfma_f32_32x32x2xf32,
-    mfma_f32_16x16x4xf32,
+    mfma_f32_32x32x1f32 = 0,
+    mfma_f32_16x16x1f32,
+    mfma_f32_4x4x1f32,
+    mfma_f32_32x32x2f32,
+    mfma_f32_16x16x4f32,
     mfma_f32_32x32x4f16,
     mfma_f32_16x16x4f16,
     mfma_f32_4x4x4f16,
@@ -78,6 +80,8 @@ enum struct MfmaInstr
     mfma_f32_16x16x128f8f6f4,
     mfma_scale_f32_32x32x64f8f6f4,
     mfma_scale_f32_16x16x128f8f6f4,
+    mfma_f32_16x16x8xf32, // tf32
+    mfma_f32_32x32x4xf32,
     // gfx11
     wmma_f32_16x16x16_f16,
     wmma_f32_16x16x16_bf16,
@@ -98,7 +102,7 @@ template <MfmaInstr instr>
 struct mfma_type;
 
 template <>
-struct mfma_type<MfmaInstr::mfma_f32_32x32x1xf32>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x1f32>
 {
     static constexpr index_t group_size          = 4;
     static constexpr index_t num_groups_per_blk  = 4;
@@ -120,7 +124,7 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x1xf32>
 };
 
 template <>
-struct mfma_type<MfmaInstr::mfma_f32_32x32x2xf32>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x2f32>
 {
     static constexpr index_t group_size          = 4;
     static constexpr index_t num_groups_per_blk  = 4;
@@ -142,7 +146,7 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x2xf32>
 };
 
 template <>
-struct mfma_type<MfmaInstr::mfma_f32_16x16x4xf32>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x4f32>
 {
     static constexpr index_t group_size          = 4;
     static constexpr index_t num_groups_per_blk  = 1;
@@ -164,7 +168,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x4xf32>
 };
 
 template <>
-struct mfma_type<MfmaInstr::mfma_f32_16x16x1xf32>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x1f32>
 {
     static constexpr index_t group_size          = 4;
     static constexpr index_t num_groups_per_blk  = 1;
@@ -187,7 +191,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x1xf32>
 
 // treat 4x4x1 as a single-blk 4x64 mfma
 template <>
-struct mfma_type<MfmaInstr::mfma_f32_4x4x1xf32>
+struct mfma_type<MfmaInstr::mfma_f32_4x4x1f32>
 {
     static constexpr index_t group_size          = 4;
     static constexpr index_t num_groups_per_blk  = 1;
@@ -947,6 +951,70 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
     }
 };
 
+/**
+ * num_threads_per_blk == n_per_blk
+ * num_regs_per_blk * num_input_blks == m_per_blk
+ * num_regs_per_blk * wave_size == m_per_blk * n_per_blk
+ *
+ * group_size * num_groups_per_blk == num_regs_per_blk
+ *
+ * num_regs_per_blk is output(CD) register size which is determined by the instruction.
+ * k_per_blk(K1PerXdlops) is input(AB) register size which is determined by the instruction.
+ * group_size is corresponding to CD rows mapping. see: GetBeginOfThreadBlk()
+ *
+ * is_k_reduction = (k_per_blk == KPerXdlops) ? false: true.
+ *
+ * if (is_k_reduction){
+ *      num_output_blks == 1;
+ * } else {
+ *      num_input_blks == num_output_blks;
+ * }
+ */
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x8xf32>
+{
+    static constexpr index_t wave_size           = 64;        // fixed
+    static constexpr index_t m_per_blk           = 16;        // from the instruction
+    static constexpr index_t n_per_blk           = 16;        // from the instruction
+    static constexpr index_t num_threads_per_blk = n_per_blk; // 16
+    static constexpr index_t num_regs_per_blk    = m_per_blk * n_per_blk / wave_size; // 4
+    static constexpr index_t num_input_blks      = m_per_blk / num_regs_per_blk;      // 4
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t k_per_blk           = 2; //  k_per_blk(K1PerXdlops) should be 2.
+    static constexpr bool is_k_reduction         = true;
+
+    // AB register size : 2,  register size: 4
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x8xf32<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x4xf32>
+{
+    static constexpr index_t wave_size           = 64;        // fixed
+    static constexpr index_t m_per_blk           = 32;        // from the instruction
+    static constexpr index_t n_per_blk           = 32;        // from the instruction
+    static constexpr index_t num_threads_per_blk = n_per_blk; // 32
+    static constexpr index_t num_regs_per_blk    = m_per_blk * n_per_blk / wave_size; // 16
+    static constexpr index_t num_input_blks      = m_per_blk / num_regs_per_blk;      // 2
+    static constexpr index_t group_size          = 4; // corresponding to CD rows mapping
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t k_per_blk           = 2;
+    static constexpr bool is_k_reduction         = true;
+    // AB register size: 2, CD register size: 16
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x4xf32<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
 // gfx11
 struct mfma_type_gfx11_base
 {
@@ -1116,6 +1184,20 @@ struct mfma_type<MfmaInstr::wmma_unsupport_16x16_gfx12> : public mfma_type_gfx12
     }
 };
 
+/**
+ * @class MfmaSelector
+ * @brief Selects the appropriate MFMA instruction type and configuration for given data types
+ *          and tile sizes on AMD GPUs.
+ *
+ * @tparam base_type        The base data type for the matrix operation (e.g., float, half_t).
+ * @tparam MPerXdlops       The number of rows per XDLops tile.
+ * @tparam NPerXdlops       The number of columns per XDLops tile.
+ * @tparam additional_type  (Optional) Additional data type for mixed-precision or special cases.
+ *                          Defaults to base_type.
+ * @tparam is_single_rate_mfma (Optional) Whether to use single-rate MFMA instructions.
+ *                          Defaults to false.
+ * @tparam is_scale_mfma    (Optional) Whether to use scale MFMA instructions. Defaults to false.
+ */
 template <typename base_type,
           index_t MPerXdlops,
           index_t NPerXdlops,
@@ -1147,37 +1229,37 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<float, 64, 64>()
     {
-        return MfmaInstr::mfma_f32_32x32x1xf32;
+        return MfmaInstr::mfma_f32_32x32x1f32;
     }
 
     template <>
     constexpr auto GetMfma<float, 32, 64>()
     {
-        return MfmaInstr::mfma_f32_32x32x1xf32;
+        return MfmaInstr::mfma_f32_32x32x1f32;
     }
 
     template <>
     constexpr auto GetMfma<float, 16, 64>()
     {
-        return MfmaInstr::mfma_f32_16x16x1xf32;
+        return MfmaInstr::mfma_f32_16x16x1f32;
     }
 
     template <>
     constexpr auto GetMfma<float, 8, 64>()
     {
-        return MfmaInstr::mfma_f32_4x4x1xf32;
+        return MfmaInstr::mfma_f32_4x4x1f32;
     }
 
     template <>
     constexpr auto GetMfma<float, 4, 64>()
     {
-        return MfmaInstr::mfma_f32_4x4x1xf32;
+        return MfmaInstr::mfma_f32_4x4x1f32;
     }
 
     template <>
     constexpr auto GetMfma<float, 32, 32>()
     {
-        return MfmaInstr::mfma_f32_32x32x2xf32;
+        return MfmaInstr::mfma_f32_32x32x2f32;
     }
 
     template <>
@@ -1188,7 +1270,35 @@ struct MfmaSelector
 #elif defined(__gfx11__)
         return MfmaInstr::wmma_unsupport_16x16_gfx11;
 #else
-        return MfmaInstr::mfma_f32_16x16x4xf32;
+        return MfmaInstr::mfma_f32_16x16x4f32;
+#endif
+    }
+
+    template <>
+    constexpr auto GetMfma<tf32_t, 32, 32>()
+    {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx942__)
+        return MfmaInstr::mfma_f32_32x32x4xf32;
+#else
+        return MfmaInstr::mfma_f32_32x32x2f32;
+#endif
+    }
+
+    template <>
+    constexpr auto GetMfma<tf32_t, 16, 16>()
+    {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx942__)
+        return MfmaInstr::mfma_f32_16x16x8xf32;
+#else
+        return MfmaInstr::mfma_f32_16x16x4f32;
 #endif
     }
 
@@ -1384,25 +1494,31 @@ struct MfmaSelector
     }
 
     template <>
-    constexpr auto GetMfma<f8_t, 32, 32, f8_t, false, true>()
+    constexpr auto GetMfma<f8_t, 32, 32, f8_t, is_single_rate_mfma, true>()
     {
         return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
     }
 
     template <>
-    constexpr auto GetMfma<bf8_t, 32, 32, f8_t, false, true>()
+    constexpr auto GetMfma<bf8_t, 32, 32, f8_t, is_single_rate_mfma, true>()
     {
         return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
     }
     template <>
-    constexpr auto GetMfma<f4_t, 32, 32, f4_t, false, true>()
+    constexpr auto GetMfma<f4_t, 32, 32, f4_t, is_single_rate_mfma, true>()
     {
         return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
     }
     template <>
-    constexpr auto GetMfma<f4_t, 16, 16, f4_t, false, true>()
+    constexpr auto GetMfma<f4_t, 16, 16, f4_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
@@ -1432,48 +1548,84 @@ struct MfmaSelector
     }
 
     template <>
-    constexpr auto GetMfma<f8_t, 16, 16, f8_t, false, true>()
+    constexpr auto GetMfma<f8_t, 16, 16, f8_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
-    constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, false, true>()
+    constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
-    constexpr auto GetMfma<f8_t, 16, 16, bf8_t, false, true>()
+    constexpr auto GetMfma<f8_t, 16, 16, bf8_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
-    constexpr auto GetMfma<bf8_t, 16, 16, f8_t, false, true>()
+    constexpr auto GetMfma<bf8_t, 16, 16, f8_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
-    constexpr auto GetMfma<f6_t, 32, 32, f6_t, false, true>()
+    constexpr auto GetMfma<f6_t, 32, 32, f6_t, is_single_rate_mfma, true>()
     {
         return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
     }
     template <>
-    constexpr auto GetMfma<f6_t, 16, 16, f6_t, false, true>()
+    constexpr auto GetMfma<f6_t, 16, 16, f6_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
     template <>
-    constexpr auto GetMfma<bf6_t, 32, 32, bf6_t, false, true>()
+    constexpr auto GetMfma<bf6_t, 32, 32, bf6_t, is_single_rate_mfma, true>()
     {
         return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
     }
     template <>
-    constexpr auto GetMfma<bf6_t, 16, 16, bf6_t, false, true>()
+    constexpr auto GetMfma<bf6_t, 16, 16, bf6_t, is_single_rate_mfma, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+#endif
     }
 
     template <>
@@ -1852,9 +2004,9 @@ struct XdlopsGemm
                        Sequence<8>{}));
     }
 
-    __device__ static constexpr index_t GetRegSizePerXdlops()
+    __device__ __host__ static constexpr index_t GetRegSizePerXdlops()
     {
-        return MPerXdlops * NPerXdlops / mfma_instr.wave_size;
+        return mfma_instr.num_regs_per_blk;
     }
 
     __device__ static constexpr index_t GetWaveSize() { return mfma_instr.wave_size; }
@@ -1864,12 +2016,12 @@ struct XdlopsGemm
     {
         static_assert(
             is_same<base_type, double>::value || is_same<base_type, float>::value ||
-                is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
-                is_same<base_type, int8_t>::value || is_same<base_type, f8_t>::value ||
-                is_same<base_type, bf8_t>::value ||
+                is_same<base_type, tf32_t>::value || is_same<base_type, half_t>::value ||
+                is_same<base_type, bhalf_t>::value || is_same<base_type, int8_t>::value ||
+                is_same<base_type, f8_t>::value || is_same<base_type, bf8_t>::value ||
                 (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
                 (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value),
-            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");
+            "base_type must be double, float, tf32_t, half, bfloat16, int8_t, f8_t or bf8_t!");
 
         static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
             if constexpr(!TransposeC)
@@ -1961,7 +2113,7 @@ struct XdlopsGemm
     {
         const auto laneId = GetLaneId();
 #if defined(__gfx11__)
-        const auto blk_idx = GetGfx11InputBlkIdx<true>();
+        const auto blk_idx = GetGfx11InputBlkIdx<!TransposeC>();
 #else
         const auto blk_idx = GetBlkIdx();
 #endif
@@ -1983,7 +2135,7 @@ struct XdlopsGemm
     {
         const auto laneId = GetLaneId();
 #if defined(__gfx11__)
-        const auto blk_idx = GetGfx11InputBlkIdx<false>();
+        const auto blk_idx = GetGfx11InputBlkIdx<TransposeC>();
 #else
         const auto blk_idx = GetBlkIdx();
 #endif
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 977c622f06..03c1945d95 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -13,6 +13,14 @@
 namespace ck {
 namespace tensor_operation {
 
+/**
+ * @brief Enable custom tensor transform for convolution backward data output.
+ *
+ * When set to 1, this macro enables a custom transformation of the output tensor
+ * in convolution backward data operations.
+ */
+#define CK_USE_CUSTOM_TENSOR_TRANSFORM_FOR_BWD_DATA_OUT 1
+
 template <
     index_t NDimSpatial,
     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
@@ -705,6 +713,12 @@ struct TransformConvBwdDataToGemm_v1
 
             if constexpr(NDimSpatial == 2)
             {
+                const index_t K0PerBlock = GemmKPerBlock / AK1;
+                const index_t AK0        = math::integer_divide_ceil(YDotSlice * XDotSlice * K_,
+                                                              AK1 * K0PerBlock * batch_k_) *
+                                    K0PerBlock;
+
+#if CK_USE_CUSTOM_TENSOR_TRANSFORM_FOR_BWD_DATA_OUT == 0
                 // A: output tensor
                 const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
                     out_grid_desc,
@@ -762,12 +776,6 @@ struct TransformConvBwdDataToGemm_v1
                         make_tuple(GemmKPerBlock, GemmMPerBlock),
                         Sequence<true, DoPadGemmM>{});
 
-                const index_t K0PerBlock = GemmKPerBlock / AK1;
-                const index_t AK0 =
-                    math::integer_divide_ceil(out_gemmk_gemmm_padded_grid_desc.GetLength(I0),
-                                              AK1 * K0PerBlock * batch_k_) *
-                    K0PerBlock;
-
                 const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                     out_gemmk_gemmm_padded_grid_desc,
                     make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)),
@@ -775,8 +783,46 @@ struct TransformConvBwdDataToGemm_v1
                                    out_gemmk_gemmm_padded_grid_desc.GetLength(I1))),
                     make_tuple(Sequence<0>{}, Sequence<1>{}),
                     make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
                 return out_gemmak0_gemmm_gemmak1_grid_desc;
+#else
+                const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
+                    out_grid_desc,
+                    make_tuple(make_pass_through_transform(N_),
+                               make_pad_transform(Ho_, I0, I0),
+                               make_pad_transform(Wo_, I0, I0),
+                               make_pass_through_transform(K_)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+                const auto out_n_hop_wop_k_grid_desc_final = transform_tensor_descriptor(
+                    out_n_hop_wop_k_grid_desc,
+                    make_tuple(make_conv_bwd_data_out_transform(N_,
+                                                                Ho_,
+                                                                Wo_,
+                                                                K_,
+                                                                YDot_,
+                                                                XDot_,
+                                                                HTilde_,
+                                                                WTilde_,
+                                                                ConvDilationH_,
+                                                                ConvDilationW_,
+                                                                HTildeSlice,
+                                                                WTildeSlice,
+                                                                YDotSlice,
+                                                                XDotSlice,
+                                                                IHTildeSliceBegin,
+                                                                IWTildeSliceBegin,
+                                                                GcdStrideDilationH_,
+                                                                GcdStrideDilationW_,
+                                                                AK0,
+                                                                AK1,
+                                                                GemmMPerBlock,
+                                                                GemmKPerBlock)),
+                    make_tuple(Sequence<0, 1, 2, 3>{}),
+                    make_tuple(Sequence<0, 1, 2>{}));
+
+                return out_n_hop_wop_k_grid_desc_final;
+#endif
             }
             else if constexpr(NDimSpatial == 3)
             {
diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp
index 2edbb7c789..c5525d5ff8 100644
--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -18,14 +18,13 @@
 #define CK_USE_OCP_FP8 0
 #endif
 
-#if(defined(__gfx942__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx950__)) && \
-    __HIP_DEVICE_COMPILE__
+#if(defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)) && __HIP_DEVICE_COMPILE__
 #define CK_FP8_CVT_FAST_PATH 1
 #else
 #define CK_FP8_CVT_FAST_PATH 0
 #endif
 
-#if(defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx950__)) && __HIP_DEVICE_COMPILE__
+#if(defined(__gfx950__) || defined(__gfx12__)) && __HIP_DEVICE_COMPILE__
 #define CK_OCP_FP8_CVT_FAST_PATH 1
 #else
 #define CK_OCP_FP8_CVT_FAST_PATH 0
@@ -33,8 +32,34 @@
 
 namespace ck {
 
-using f8_fnuz_t  = _BitInt(8);
-using bf8_fnuz_t = unsigned _BitInt(8);
+struct f8_fnuz_t
+{
+    using data_type  = unsigned char;
+    data_type m_data = data_type{};
+    __host__ __device__ explicit constexpr f8_fnuz_t(data_type in_data) : m_data(in_data) {}
+    __host__ __device__ explicit constexpr f8_fnuz_t() = default;
+    __host__ __device__ bool constexpr operator==(f8_fnuz_t other) const
+    {
+        return m_data == other.m_data;
+    }
+    __host__ __device__ explicit constexpr operator data_type() const { return m_data; }
+};
+
+struct bf8_fnuz_t
+{
+    using data_type  = unsigned char;
+    data_type m_data = data_type{};
+    __host__ __device__ explicit constexpr bf8_fnuz_t(data_type in_data) : m_data(in_data) {}
+    __host__ __device__ explicit constexpr bf8_fnuz_t() = default;
+    __host__ __device__ bool constexpr operator==(bf8_fnuz_t other) const
+    {
+        return m_data == other.m_data;
+    }
+    __host__ __device__ explicit constexpr operator data_type() const { return m_data; }
+};
+
+static_assert(1 == sizeof(f8_fnuz_t));
+static_assert(1 == sizeof(bf8_fnuz_t));
 
 typedef unsigned char fp8_storage_t;
 
@@ -364,7 +389,7 @@ struct bf8_ocp_t
     __host__ explicit operator float() const
 #endif
     {
-#if defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx950__) || defined(__gfx12__)
         return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
 #else
         return fp8_impl::cast_from_f8<float, wm, we, false>(
@@ -378,7 +403,7 @@ struct bf8_ocp_t
     __host__ explicit operator _Float16() const
 #endif
     {
-#if defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx950__) || defined(__gfx12__)
         return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
 #else
         return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index e14c0d62a8..09a462d016 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -104,7 +104,7 @@ struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, Opsel>
         // opsel usage
         // false: D0.[0:15] = result
         // true : D0.[16:31]= result
-#if defined(__gfx11__) || defined(__gfx12__)
+#if defined(__gfx11__)
         reg_c.template AsType<bhalf16_t>()(Number<0>{}) =
             __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(
                 reg_a, reg_b, reg_c.template AsType<bhalf16_t>()[Number<0>{}], Opsel);
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index 02a7a72b8c..7ff8e6b057 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1636,4 +1636,45 @@ struct intrin_mfma_f32_16x16x32bf8f8<16, 16>
     }
 };
 
+/******************* tf32  *************************************/
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x8xf32;
+
+template <>
+struct intrin_mfma_f32_16x16x8xf32<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const float2_t& reg_a, const float2_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx94__)
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x8_xf32(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x4xf32;
+
+template <>
+struct intrin_mfma_f32_32x32x4xf32<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const float2_t& reg_a, const float2_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx94__)
+        reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x4_xf32(
+            reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], 0, 0, 0);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
 } // namespace ck
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 5fbe30d21b..574269b94a 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
+#include <stdint.h>
 #include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/e8m0.hpp"
 #include "ck/utility/statically_indexed_array.hpp"
@@ -26,6 +26,7 @@ using byte = unsigned char;
 using std::byte;
 #endif
 
+using tf32_t  = _BitInt(19); // 1 sign bit, 8 exponent bits, 10 mantissa bits
 using bhalf_t = ushort;
 using half_t  = _Float16;
 using int4_t  = _BitInt(4);
@@ -204,7 +205,7 @@ inline constexpr bool is_native_type()
     return is_same<T, double>::value || is_same<T, float>::value || is_same<T, half_t>::value ||
            is_same<T, bhalf_t>::value || is_same<T, int32_t>::value ||
            is_same<T, uint32_t>::value || is_same<T, int8_t>::value || is_same<T, uint8_t>::value ||
-           is_same<T, f8_fnuz_t>::value || is_same<T, bf8_fnuz_t>::value || is_same<T, bool>::value;
+           is_same_v<T, _BitInt(8)> || is_same_v<T, unsigned _BitInt(8)> || is_same<T, bool>::value;
 }
 
 // scalar_type
@@ -299,14 +300,14 @@ struct scalar_type<pk_i4_t>
 template <>
 struct scalar_type<f8_fnuz_t>
 {
-    using type                           = f8_fnuz_t;
+    using type                           = f8_fnuz_t::data_type;
     static constexpr index_t vector_size = 1;
 };
 
 template <>
 struct scalar_type<bf8_fnuz_t>
 {
-    using type                           = bf8_fnuz_t;
+    using type                           = bf8_fnuz_t::data_type;
     static constexpr index_t vector_size = 1;
 };
 
@@ -324,12 +325,14 @@ struct scalar_type<bf8_ocp_t>
     static constexpr index_t vector_size = 1;
 };
 
+#ifndef CK_CODE_GEN_RTC
 template <>
 struct scalar_type<e8m0_bexp_t>
 {
     using type                           = e8m0_bexp_t::type;
     static constexpr index_t vector_size = 1;
 };
+#endif
 
 template <>
 struct scalar_type<f4x2_pk_t>
@@ -461,4 +464,40 @@ using int64_t = long long;
 using int64_t = long;
 #endif
 
+template <typename T>
+inline const char* get_type_name()
+{
+    if constexpr(is_same_v<T, half_t>)
+        return "fp16";
+    else if constexpr(is_same_v<T, bhalf_t>)
+        return "bf16";
+    else if constexpr(is_same_v<T, tf32_t>)
+        return "tf32";
+    else if constexpr(is_same_v<T, int4_t>)
+        return "int4";
+    else if constexpr(is_same_v<T, f4_t>)
+        return "f4";
+    else if constexpr(is_same_v<T, f6_t>)
+        return "f6";
+    else if constexpr(is_same_v<T, bf6_t>)
+        return "bf6";
+    else if constexpr(is_same_v<T, f8_t>)
+        return "f8";
+    else if constexpr(is_same_v<T, bf8_t>)
+        return "bf8";
+#ifndef CK_CODE_GEN_RTC
+    else if constexpr(is_same_v<T, e8m0_bexp_t>)
+        return "e8m0";
+#endif
+    else if constexpr(is_same_v<T, float>)
+        return "fp32";
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
+    else
+        return "unknown";
+#else
+    else
+        return typeid(T).name();
+#endif
+}
+
 } // namespace ck
diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp
index 45d443ae49..1b86b33777 100644
--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
@@ -13,7 +13,7 @@ template <typename T, typename Enable = void>
 struct PrintAsType;
 
 template <typename T>
-struct PrintAsType<T, typename std::enable_if<std::is_floating_point<T>::value>::type>
+struct PrintAsType<T, typename enable_if<is_floating_point<T>::value>::type>
 {
     using type = float;
     __host__ __device__ static void Print(const T& p) { printf("%.3f ", static_cast<type>(p)); }
@@ -30,7 +30,7 @@ struct PrintAsType<ck::half_t, void>
 };
 
 template <typename T>
-struct PrintAsType<T, typename std::enable_if<std::is_integral<T>::value>::type>
+struct PrintAsType<T, typename enable_if<is_integral<T>::value>::type>
 {
     using type = int;
     __host__ __device__ static void Print(const T& p) { printf("%d ", static_cast<type>(p)); }
diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp
index ae0edb35ee..084240f84b 100644
--- a/include/ck/utility/dtype_vector.hpp
+++ b/include/ck/utility/dtype_vector.hpp
@@ -1294,11 +1294,25 @@ struct nnvb_data_t_selector<bf8_ocp_t>
     using type = bf8_ocp_t::data_type;
 };
 
+#ifndef CK_CODE_GEN_RTC
+template <>
+struct nnvb_data_t_selector<f8_fnuz_t>
+{
+    using type = f8_fnuz_t::data_type;
+};
+
+template <>
+struct nnvb_data_t_selector<bf8_fnuz_t>
+{
+    using type = bf8_fnuz_t::data_type;
+};
+
 template <>
 struct nnvb_data_t_selector<e8m0_bexp_t>
 {
     using type = e8m0_bexp_t::type;
 };
+#endif
 
 template <>
 struct nnvb_data_t_selector<f6x16_pk_t>
@@ -2258,8 +2272,10 @@ using bf6x16_t   = typename vector_type<bf6x16_pk_t, 1>::type;
 using bf6x16x2_t = typename vector_type<bf6x16_pk_t, 2>::type;
 using bf6x32_t   = typename vector_type<bf6x32_pk_t, 1>::type;
 
+#ifndef CK_CODE_GEN_RTC
 // e8m0
 using e8m0x4_bexp_t = typename vector_type<e8m0_bexp_t, 4>::type;
+#endif
 
 // pack int4
 using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
diff --git a/include/ck/utility/e8m0.hpp b/include/ck/utility/e8m0.hpp
index f7d2a2f594..ac2a114593 100644
--- a/include/ck/utility/e8m0.hpp
+++ b/include/ck/utility/e8m0.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#ifndef CK_CODE_GEN_RTC
 #include "ck/utility/type.hpp"
 
 namespace ck {
@@ -78,3 +79,4 @@ __host__ __device__ inline constexpr int32_t get_exponent_value<e8m0_bexp_t>(e8m
 } // namespace utils
 
 } // namespace ck
+#endif
diff --git a/include/ck/utility/f8_utils.hpp b/include/ck/utility/f8_utils.hpp
index 799683ae65..94c2f84c8c 100644
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -39,7 +39,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
     int exponent, bias;
     uint32_t head, mantissa, sign;
     // nan code is same for float and half
-    constexpr Y nan_code        = 0x80;
+    constexpr uint8_t nan_code  = 0x80;
     constexpr uint32_t nan_mask = NumericUtils<X>::nan_mask;
 
     // convert to bitwise
@@ -60,17 +60,17 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
     if constexpr(negative_zero_nan)
     {
         if((x_bitwise & nan_mask) == nan_mask)
-            return nan_code;
+            return Y{nan_code};
     }
     else
     {
         if((x_bitwise & nan_mask) == nan_mask)
-            return signed_inf + (mantissa != 0 ? 1 : 0);
+            return Y{static_cast<uint8_t>(signed_inf + (mantissa != 0 ? 1 : 0))};
     }
 
     // check if x is 0.0
     if(x_bitwise == 0)
-        return 0;
+        return Y{0};
 
     // First need to check if it is normal or denorm as there is a difference of implict 1
     // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
@@ -178,9 +178,10 @@ In this case, the fp16 mantissa should be shift left by 1 */
 
     // check if x is 0.0 or -0.0
     if(out_exponent == 0 && mantissa == 0)
-        return negative_zero_nan ? 0 : (sign << (out_exp + out_mant));
+        return Y{negative_zero_nan ? 0 : static_cast<uint8_t>(sign << (out_exp + out_mant))};
     mantissa &= (1 << out_mant) - 1;
-    return (sign << (out_exp + out_mant)) | (out_exponent << out_mant) | mantissa;
+    return Y{static_cast<uint8_t>((sign << (out_exp + out_mant)) | (out_exponent << out_mant) |
+                                  mantissa)};
 }
 
 template <typename X, typename Y, bool negative_zero_nan>
@@ -195,8 +196,8 @@ __host__ __device__ Y run_cast_from_f8(X x)
     constexpr int out_mant = NumericUtils<Y>::mant;
 
     // prepare the codes
-    constexpr X nan_code = 0x80;
-    using T_bitwise      = typename NumericUtils<Y>::bitwise_type;
+    constexpr uint8_t nan_code = 0x80;
+    using T_bitwise            = typename NumericUtils<Y>::bitwise_type;
 
     constexpr T_bitwise Inf_bitwise    = NumericUtils<Y>::Inf;
     constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf;
@@ -209,13 +210,13 @@ __host__ __device__ Y run_cast_from_f8(X x)
     constexpr Y Neg0   = bit_cast<Y>(Neg0_bitwise);
 
     // check if x is 0.0
-    if(x == 0)
+    if(!static_cast<uint8_t>(x))
         return static_cast<Y>(0);
 
     // unpack the input
-    uint32_t sign     = x >> (in_exp + in_mant);
-    uint32_t mantissa = x & ((1 << in_mant) - 1);
-    int exponent      = (x & 0x7F) >> in_mant;
+    uint32_t sign     = static_cast<uint8_t>(x) >> (in_exp + in_mant);
+    uint32_t mantissa = static_cast<uint8_t>(x) & ((1 << in_mant) - 1);
+    int exponent      = (static_cast<uint8_t>(x) & 0x7F) >> in_mant;
 
     constexpr int exp_low_cutoff =
         (1 << (out_exp - 1)) - (1 << (in_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);
@@ -223,12 +224,12 @@ __host__ __device__ Y run_cast_from_f8(X x)
 
     if constexpr(negative_zero_nan)
     {
-        if(x == nan_code)
+        if(static_cast<uint8_t>(x) == nan_code)
             return NaN;
     }
     else
     {
-        if(x == nan_code)
+        if(static_cast<uint8_t>(x) == nan_code)
             return Neg0;
         if(exponent == ((1 << in_exp) - 1))
             return (mantissa == 0) ? (sign ? NegInf : Inf) : NaN;
@@ -272,8 +273,8 @@ template <typename X, typename Y, bool negative_zero_nan, bool clip, bool stoch>
 __host__ __device__ Y cast_to_f8(X x, uint32_t rng)
 {
     // check datatypes
-    constexpr bool is_half  = std::is_same<X, half_t>::value;
-    constexpr bool is_float = std::is_same<X, float>::value;
+    constexpr bool is_half  = is_same<X, half_t>::value;
+    constexpr bool is_float = is_same<X, float>::value;
     static_assert(is_half || is_float, "Only half and float can be casted.");
 
     return run_cast_to_f8<X, Y, negative_zero_nan, clip, stoch>(x, rng);
@@ -283,8 +284,8 @@ template <typename X, typename Y, bool negative_zero_nan>
 __host__ __device__ Y cast_from_f8(X x)
 {
     // check datatype
-    constexpr bool is_half  = std::is_same<Y, half_t>::value;
-    constexpr bool is_float = std::is_same<Y, float>::value;
+    constexpr bool is_half  = is_same<Y, half_t>::value;
+    constexpr bool is_float = is_same<Y, float>::value;
     static_assert(is_half || is_float, "only half and float are supported.");
 
     return run_cast_from_f8<X, Y, negative_zero_nan>(x);
diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp
index 53e865767b..c96a6c3aef 100644
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
@@ -7,7 +7,6 @@
 
 namespace ck {
 
-#if defined(CK_ENABLE_DYNAMIC_WARP_SIZE)
 __device__ constexpr index_t get_warp_size()
 {
 #if defined(__HIP_DEVICE_COMPILE__)
@@ -38,16 +37,6 @@ inline __host__ index_t get_warp_size()
 #endif
     return 64;
 }
-#else
-__host__ __device__ constexpr index_t get_warp_size()
-{
-#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
-    return 64;
-#else
-    return 32;
-#endif
-}
-#endif
 
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
 
diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp
index 993b70a3fb..7227cee754 100644
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -10,10 +10,6 @@
 #include "type.hpp"
 #include "tuple.hpp"
 
-#ifdef CK_CODE_GEN_RTC
-#define INT32_MAX 2147483647
-#endif
-
 namespace ck {
 
 // magic number division
diff --git a/include/ck/utility/numeric_limits.hpp b/include/ck/utility/numeric_limits.hpp
index e59b7eceaf..b8d6280acc 100644
--- a/include/ck/utility/numeric_limits.hpp
+++ b/include/ck/utility/numeric_limits.hpp
@@ -522,8 +522,6 @@ struct NumericLimits<bf6_t>
     }
 };
 
-#endif
-
 template <>
 struct NumericLimits<e8m0_bexp_t>
 {
@@ -551,5 +549,6 @@ struct NumericLimits<e8m0_bexp_t>
         return e8m0_bexp_t(binary_142);
     }
 };
+#endif
 
 } // namespace ck
diff --git a/include/ck/utility/numeric_utils.hpp b/include/ck/utility/numeric_utils.hpp
index 726f667518..399bc0c3e8 100644
--- a/include/ck/utility/numeric_utils.hpp
+++ b/include/ck/utility/numeric_utils.hpp
@@ -10,6 +10,7 @@ struct NumericUtils
 {
 };
 
+#ifndef CK_CODE_GEN_RTC
 template <>
 struct NumericUtils<e8m0_bexp_t>
 {
@@ -24,6 +25,7 @@ struct NumericUtils<e8m0_bexp_t>
 
     using bitwise_type = uint8_t;
 };
+#endif
 
 template <>
 struct NumericUtils<float>
diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp
index c37d3922ca..dd2662b6d9 100644
--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 #include <ck/utility/ignore.hpp>
+#include <ck/utility/type.hpp>
 #include "ck/ck.hpp"
 
 #ifdef CK_CODE_GEN_RTC
@@ -14,10 +15,10 @@ namespace ck {
 
 // Pseudo random number generator
 // version for fp32
-template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<float, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<is_same<float, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
-    uint32_t x         = *(reinterpret_cast<uint32_t*>(&val));
+    uint32_t x         = bit_cast<uint32_t>(val);
     uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
     drop_bits ^= x >> 16;
     drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
@@ -30,10 +31,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }
 
 // version for fp16
-template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
-    uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
+    uint16_t x         = bit_cast<uint16_t>(val);
     uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
     drop_bits          = ((drop_bits & 31) << 11) | (drop_bits >> 5);
     drop_bits *= 0x7000149;
@@ -47,7 +48,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 // return 0 if data is not fp16 or fp32
 template <typename T,
           uint32_t seed_t,
-          ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
+          ck::enable_if_t<!(is_same<float, T>{} || is_same<_Float16, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
     ck::ignore = id;
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index 99538ac78c..701b2686c7 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -187,12 +187,24 @@ inline __host__ __device__ constexpr bf8_ocp_t type_convert<bf8_ocp_t, int>(int
     return bf8_ocp_t{type_convert<bf8_ocp_t::data_type>(x)};
 }
 
+template <typename Y, enable_if_t<is_same_v<Y, ck::tf32_t>, bool> = false>
+inline __host__ __device__ constexpr float type_convert(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+
+    u.int32 = u.int32 & 0xffffe000;
+    return u.fp32;
+}
+
 // Convert X to Y
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert_sp(X x)
 {
     static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);
-
     return static_cast<Y>(x);
 }
 
@@ -244,6 +256,65 @@ inline __host__ __device__ constexpr half_t type_convert_sp<half_t, int>(int x)
     return u.fp16;
 }
 
+template <>
+inline __host__ __device__ constexpr int type_convert_sp<int, f8_t>(f8_t x)
+{
+    union
+    {
+        f8_t fp8;
+        int int32;
+    } u = {x};
+
+    return u.int32;
+}
+
+template <>
+inline __host__ __device__ constexpr f8_t type_convert_sp<f8_t, int>(int x)
+{
+    union
+    {
+        int int32;
+        f8_t fp8;
+    } u = {x};
+
+    return u.fp8;
+}
+
+template <>
+inline __host__ __device__ constexpr int type_convert_sp<int, bhalf_t>(bhalf_t x)
+{
+    union
+    {
+        bhalf_t fp16;
+        int int32;
+    } u = {x};
+
+    return u.int32;
+}
+
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert_sp<bhalf_t, int>(int x)
+{
+    union
+    {
+        int int32;
+        bhalf_t fp16;
+    } u = {x};
+
+    return u.fp16;
+}
+
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert_sp<bhalf_t, float>(float x)
+{
+    return type_convert<bhalf_t>(x);
+}
+
+template <>
+inline __host__ __device__ constexpr half_t type_convert_sp<half_t, float>(float x)
+{
+    return type_convert<half_t>(x);
+}
 // Declare a template function for fp8 conversion using SR
 template <typename Y, typename X>
 __host__ __device__ constexpr Y f8_convert_sr(X x);
@@ -280,7 +351,7 @@ inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, float>(float x)
         val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
     ival       = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
     val.i32val = ival;
-    return val.i8val[0]; // little endian
+    return f8_fnuz_t{val.i8val[0]}; // little endian
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
@@ -348,7 +419,7 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, float>(float x)
         val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
     ival       = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
     val.i32val = ival;
-    return val.i8val[0]; // little endian
+    return bf8_fnuz_t{val.i8val[0]}; // little endian
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
@@ -584,7 +655,7 @@ inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, float>(float x)
         val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
     ival       = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false); // false -> WORD0
     val.i32val = ival;
-    return val.i8val[0];
+    return f8_fnuz_t{val.i8val[0]};
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
@@ -636,7 +707,7 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, float>(float x)
         val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
     ival       = __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
     val.i32val = ival;
-    return val.i8val[0];
+    return bf8_fnuz_t{val.i8val[0]};
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
@@ -853,7 +924,7 @@ inline __host__ __device__ float type_convert<float, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
     float fval;
-    uint32_t i32val = static_cast<uint32_t>(x);
+    uint32_t i32val = static_cast<uint32_t>(static_cast<uint8_t>(x));
     fval            = __builtin_amdgcn_cvt_f32_fp8(i32val, 0);
     // asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
     return fval;
@@ -917,7 +988,7 @@ inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_
 #if CK_OCP_FP8_CVT_FAST_PATH
 // __builtin_amdgcn_cvt_pk_f32_fp8 can produce incorrect results due to a compiler issue.
 // TODO: Enable when SWDEV-532959 is fixed.
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx12__)
     return float2_t{__builtin_amdgcn_cvt_f32_fp8(bit_cast<uint16_t>(x), 0),
                     __builtin_amdgcn_cvt_f32_fp8(bit_cast<uint16_t>(x), 1)};
 #else
@@ -1060,7 +1131,7 @@ inline __host__ __device__ float2_t type_convert<float2_t, bf8x2_ocp_t>(bf8x2_oc
 #if CK_OCP_FP8_CVT_FAST_PATH
 // __builtin_amdgcn_cvt_pk_f32_bf8 can produce incorrect results due to a compiler issue.
 // TODO: Enable when SWDEV-532959 is fixed.
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx12__)
     return float2_t{__builtin_amdgcn_cvt_f32_bf8(bit_cast<uint16_t>(x), 0),
                     __builtin_amdgcn_cvt_f32_bf8(bit_cast<uint16_t>(x), 1)};
 #else
@@ -1359,7 +1430,7 @@ inline __host__ __device__ float type_convert<float, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
     float fval;
-    uint32_t i32val = static_cast<uint32_t>(x);
+    uint32_t i32val = static_cast<uint32_t>(static_cast<uint8_t>(x));
     fval            = __builtin_amdgcn_cvt_f32_bf8(i32val, 0);
     // asm volatile("v_cvt_f32_bf8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
     return fval;
diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
index 1f6c389090..c96daf3e99 100644
--- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
+++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
@@ -104,7 +104,7 @@ enum struct tile_distribution_pattern
     block_raked,
 };
 
-struct TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern
 {
 };
 
@@ -126,7 +126,7 @@ template <index_t BlockSize,
           index_t VecSize,
           tile_distribution_pattern DistributionPattern,
           index_t NumWaveGroups = 1>
-struct TileDistributionEncodingPattern2D : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_2d : public tile_distribution_encoding_pattern
 {
 };
 
@@ -136,12 +136,13 @@ template <index_t BlockSize,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
-struct TileDistributionEncodingPattern2D<BlockSize,
-                                         YPerTile,
-                                         XPerTile,
-                                         VecSize,
-                                         tile_distribution_pattern::thread_raked,
-                                         NumWaveGroups> : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_2d<BlockSize,
+                                             YPerTile,
+                                             XPerTile,
+                                             VecSize,
+                                             tile_distribution_pattern::thread_raked,
+                                             NumWaveGroups>
+    : public tile_distribution_encoding_pattern
 {
 
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
@@ -165,7 +166,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
                   "X0 * warp_ys * Y0 must cover whole workgroup!");
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         if constexpr(NumWaveGroups != 1)
         {
@@ -189,7 +190,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
         }
     }
 
-    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_shuffled_2d_static_tile_distribution()
     {
         if constexpr(NumWaveGroups != 1)
         {
@@ -220,12 +221,13 @@ template <index_t BlockSize,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
-struct TileDistributionEncodingPattern2D<BlockSize,
-                                         YPerTile,
-                                         XPerTile,
-                                         VecSize,
-                                         tile_distribution_pattern::warp_raked,
-                                         NumWaveGroups> : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_2d<BlockSize,
+                                             YPerTile,
+                                             XPerTile,
+                                             VecSize,
+                                             tile_distribution_pattern::warp_raked,
+                                             NumWaveGroups>
+    : public tile_distribution_encoding_pattern
 {
 
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
@@ -244,7 +246,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
     static constexpr index_t Y1 = YPerTile / (Y2 * Y0); // # of iters within wavefront
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
@@ -255,7 +257,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
                                        sequence<1, 1>>{}); // -> <Y1, X1>
     }
 
-    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_shuffled_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
@@ -273,12 +275,13 @@ template <index_t BlockSize,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
-struct TileDistributionEncodingPattern2D<BlockSize,
-                                         YPerTile,
-                                         XPerTile,
-                                         VecSize,
-                                         tile_distribution_pattern::block_raked,
-                                         NumWaveGroups> : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_2d<BlockSize,
+                                             YPerTile,
+                                             XPerTile,
+                                             VecSize,
+                                             tile_distribution_pattern::block_raked,
+                                             NumWaveGroups>
+    : public tile_distribution_encoding_pattern
 {
 
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
@@ -295,7 +298,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
     static constexpr index_t Y0 = YPerTile / (Y2 * Y1); // # of iters
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
@@ -306,7 +309,7 @@ struct TileDistributionEncodingPattern2D<BlockSize,
                                        sequence<0, 1>>{}); // -> <Y0, X1>
     }
 
-    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_shuffled_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
@@ -336,21 +339,21 @@ template <index_t BlockSize,
           index_t VecSize,
           tile_distribution_pattern DistributionPattern,
           index_t NumWaveGroups>
-CK_TILE_HOST_DEVICE void print(const TileDistributionEncodingPattern2D<BlockSize,
-                                                                       YPerTile,
-                                                                       XPerTile,
-                                                                       VecSize,
-                                                                       DistributionPattern,
-                                                                       NumWaveGroups>&)
+CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                           YPerTile,
+                                                                           XPerTile,
+                                                                           VecSize,
+                                                                           DistributionPattern,
+                                                                           NumWaveGroups>&)
 {
-    using PatternType = TileDistributionEncodingPattern2D<BlockSize,
-                                                          YPerTile,
-                                                          XPerTile,
-                                                          VecSize,
-                                                          DistributionPattern,
-                                                          NumWaveGroups>;
+    using PatternType = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                              YPerTile,
+                                                              XPerTile,
+                                                              VecSize,
+                                                              DistributionPattern,
+                                                              NumWaveGroups>;
 
-    printf("TileDistributionEncodingPattern2D<BlockSize:%d, YPerTile:%d, XPerTile:%d, "
+    printf("tile_distribution_encoding_pattern_2d<BlockSize:%d, YPerTile:%d, XPerTile:%d, "
            "VecSize:%d, %s>: ",
            BlockSize,
            YPerTile,
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 7a9c017eb2..ff1367a9c1 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -28,6 +28,60 @@ using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
 
 namespace ck_tile {
 
+// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
+// memory to the SGPR registers.
+__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
+{
+    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
+}
+
+__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
+{
+    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
+}
+
+__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
+__device__ inline auto amd_wave_read_first_lane(const Object& obj)
+{
+    constexpr size_t ObjectSize = sizeof(Object);
+    constexpr size_t SGPR_size  = 4;
+    constexpr size_t NumFull    = ObjectSize / SGPR_size;
+    constexpr size_t Tail       = ObjectSize % SGPR_size;
+
+    const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
+    alignas(Object) unsigned char dst[ObjectSize];
+
+    static_for<0, NumFull, 1>{}([&](auto Ic) {
+        constexpr size_t offset = Ic * SGPR_size;
+        uint32_t read_src;
+        __builtin_memcpy(&read_src, src + offset, SGPR_size);
+        read_src = __builtin_amdgcn_readfirstlane(read_src);
+        __builtin_memcpy(dst + offset, &read_src, SGPR_size);
+    });
+
+    if constexpr(Tail != 0)
+    {
+        constexpr size_t offset = NumFull * SGPR_size;
+        uint32_t tail_loc       = 0;
+        __builtin_memcpy(&tail_loc, src + offset, Tail);
+        tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
+        __builtin_memcpy(dst + offset, &tail_loc, Tail);
+    }
+    Object out;
+    __builtin_memcpy(&out, dst, ObjectSize);
+    return out;
+}
+
 // 128 bit SGPRs to supply buffer resource in buffer instructions
 // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
 struct __attribute__((packed)) buffer_resource
@@ -37,10 +91,17 @@ struct __attribute__((packed)) buffer_resource
     uint32_t config;
 };
 
-CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
+template <typename ForceSGPR = std::false_type>
+CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr,
+                                                   uint32_t size = 0xffffffff,
+                                                   ForceSGPR     = {})
 {
     buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
     int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+    if constexpr(std::is_same_v<ForceSGPR, std::true_type>)
+    {
+        r = amd_wave_read_first_lane(r);
+    }
     return r;
 }
 
@@ -470,7 +531,7 @@ struct buffer_store<16>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 16);
-        using mbuf_t = fp32x4_t;
+        using mbuf_t = uint32x4_t;
 #if HAS_RAW_BUFFER_BUILTINS
         index_t s_offset = i_offset;
         __builtin_amdgcn_raw_buffer_store_b128(
@@ -496,7 +557,7 @@ struct buffer_store<8>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 8);
-        using mbuf_t = fp32x2_t;
+        using mbuf_t = uint32x2_t;
 #if HAS_RAW_BUFFER_BUILTINS
         index_t s_offset = i_offset;
         __builtin_amdgcn_raw_buffer_store_b64(
@@ -522,7 +583,7 @@ struct buffer_store<4>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 4);
-        using mbuf_t = float;
+        using mbuf_t = uint32_t;
 #if HAS_RAW_BUFFER_BUILTINS
         index_t s_offset = i_offset;
         __builtin_amdgcn_raw_buffer_store_b32(
@@ -548,7 +609,7 @@ struct buffer_store<2>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 2);
-        using mbuf_t = short;
+        using mbuf_t = uint16_t;
 #if HAS_RAW_BUFFER_BUILTINS
         index_t s_offset = i_offset;
         __builtin_amdgcn_raw_buffer_store_b16(
@@ -573,8 +634,8 @@ struct buffer_store<1>
                                    index_t i_offset /*max 0xFFF*/,
                                    index_t /*flag*/ = 1)
     {
-        static_assert(sizeof(T) == 4);
-        using mbuf_t = float;
+        static_assert(sizeof(T) == 1);
+        using mbuf_t = uint8_t;
 #if HAS_RAW_BUFFER_BUILTINS
         index_t s_offset = i_offset;
         __builtin_amdgcn_raw_buffer_store_b8(
@@ -2788,7 +2849,7 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
 }
 
 #if defined(__gfx950__)
-template <typename T, index_t N, address_space_enum BufferAddressSpace>
+template <typename T, index_t N>
 __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 {
 #define __LDS_ADDR __attribute__((address_space(3)))
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index 4013b51479..38e033cd92 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -19,6 +19,60 @@ using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
 
 namespace ck_tile {
 
+// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
+// memory to the SGPR registers.
+__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
+{
+    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
+}
+
+__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
+{
+    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
+}
+
+__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
+__device__ inline auto amd_wave_read_first_lane(const Object& obj)
+{
+    constexpr size_t ObjectSize = sizeof(Object);
+    constexpr size_t SGPR_size  = 4;
+    constexpr size_t NumFull    = ObjectSize / SGPR_size;
+    constexpr size_t Tail       = ObjectSize % SGPR_size;
+
+    const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
+    alignas(Object) unsigned char dst[ObjectSize];
+
+    static_for<0, NumFull, 1>{}([&](auto Ic) {
+        constexpr size_t offset = Ic * SGPR_size;
+        uint32_t read_src;
+        __builtin_memcpy(&read_src, src + offset, SGPR_size);
+        read_src = __builtin_amdgcn_readfirstlane(read_src);
+        __builtin_memcpy(dst + offset, &read_src, SGPR_size);
+    });
+
+    if constexpr(Tail != 0)
+    {
+        constexpr size_t offset = NumFull * SGPR_size;
+        uint32_t tail_loc       = 0;
+        __builtin_memcpy(&tail_loc, src + offset, Tail);
+        tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
+        __builtin_memcpy(dst + offset, &tail_loc, Tail);
+    }
+    Object out;
+    __builtin_memcpy(&out, dst, ObjectSize);
+    return out;
+}
+
 // 128 bit SGPRs to supply buffer resource in buffer instructions
 // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
 struct __attribute__((packed)) buffer_resource
@@ -28,10 +82,17 @@ struct __attribute__((packed)) buffer_resource
     uint32_t config;
 };
 
-CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
+template <typename ForceSGPR = std::false_type>
+CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr,
+                                                   uint32_t size = 0xffffffff,
+                                                   ForceSGPR     = {})
 {
     buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
     int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+    if constexpr(std::is_same_v<ForceSGPR, std::true_type>)
+    {
+        r = amd_wave_read_first_lane(r);
+    }
     return r;
 }
 
@@ -1335,8 +1396,10 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
     static_assert(
         (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
             (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, fp16_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
+            (std::is_same<T, bf16_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
             (std::is_same<T, int32_t>::value &&
              (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
@@ -1449,14 +1512,19 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                    src_wave_addr_offset,
                                                    static_cast<index_t>(coherence)));
         }
-        else if constexpr(N == 8)
+        else
         {
-            // use fp32 load to mimic fp16 load
-            fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
-                                                              src_thread_addr_offset,
-                                                              src_wave_addr_offset,
-                                                              static_cast<index_t>(coherence));
+            // N >= 8: build from fp32x4 chunks
+            thread_buffer<float, N / 2> tmp;
 
+            static_for<0, (N / 8), 1>{}([&](auto i) {
+                constexpr index_t chunk            = i;
+                tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
+                    src_wave_buffer_resource,
+                    src_thread_addr_offset,
+                    src_wave_addr_offset + (chunk * 4) * sizeof(float),
+                    static_cast<index_t>(coherence));
+            });
             return bit_cast<rtn_type>(tmp);
         }
     }
@@ -1486,13 +1554,19 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                   src_wave_addr_offset,
                                                   static_cast<index_t>(coherence)));
         }
-        else if constexpr(N == 8)
+        else
         {
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
-                                                              src_thread_addr_offset,
-                                                              src_wave_addr_offset,
-                                                              static_cast<index_t>(coherence));
+            // N >= 8: build from fp32x4 chunks
+            thread_buffer<float, N / 2> tmp;
 
+            static_for<0, (N / 8), 1>{}([&](auto i) {
+                constexpr index_t chunk            = i;
+                tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
+                    src_wave_buffer_resource,
+                    src_thread_addr_offset,
+                    src_wave_addr_offset + (chunk * 4) * sizeof(float),
+                    static_cast<index_t>(coherence));
+            });
             return bit_cast<rtn_type>(tmp);
         }
     }
@@ -2572,9 +2646,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
     const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
 
 #if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
-    T* lds_ptr = lds_base_ptr + lds_offset;
-    auto const lds_ptr_sgpr =
-        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
+    T* lds_ptr              = lds_base_ptr + lds_offset;
+    auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast<uintptr_t>(lds_ptr)));
     asm volatile("s_mov_b32 m0, %0; \n\t"
                  "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
                  "v"(global_offset_bytes),
@@ -2606,7 +2679,7 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 
 #if defined(__gfx950__)
-template <typename T, index_t N, address_space_enum BufferAddressSpace>
+template <typename T, index_t N>
 __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 {
 #define __LDS_ADDR __attribute__((address_space(3)))
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 42f2390cde..28ded5439a 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -9,6 +9,8 @@
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
+#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
 
 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
@@ -104,7 +106,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
     const index_t warp_id = threadIdx.x / get_warp_size();
     if constexpr(ReturnSgpr)
     {
-        return __builtin_amdgcn_readfirstlane(warp_id);
+        return amd_wave_read_first_lane(warp_id);
     }
     else
     {
diff --git a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
index c02c46958c..e56bcadcba 100644
--- a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
+++ b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
@@ -359,7 +359,7 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
 {
     static_assert((std::is_same<T, int32_t>::value && (N == 1)) ||
                       (std::is_same<T, uint32_t>::value && (N == 1)) ||
-                      (std::is_same<T, float>::value && (N == 1 || N == 2)) ||
+                      (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
                       (std::is_same<T, double>::value && (N == 1 || N == 2)) ||
                       (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
                       (std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
@@ -369,6 +369,8 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
 
     constexpr auto I0 = number<0>{};
     constexpr auto I1 = number<1>{};
+    constexpr auto I2 = number<2>{};
+    constexpr auto I3 = number<3>{};
 
     if constexpr(std::is_same<T, float>::value)
     {
@@ -381,6 +383,13 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
             atomicAdd(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[I0]);
             atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, x.template get_as<float>()[I1]);
         }
+        else if constexpr(N == 4)
+        {
+            atomicAdd(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[I0]);
+            atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, x.template get_as<float>()[I1]);
+            atomicAdd(c_style_pointer_cast<float*>(p_dst) + 2, x.template get_as<float>()[I2]);
+            atomicAdd(c_style_pointer_cast<float*>(p_dst) + 3, x.template get_as<float>()[I3]);
+        }
     }
     else if constexpr(std::is_same<T, double>::value)
     {
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index 7b5b862cb1..a7fe6b37e1 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -3,10 +3,11 @@
 
 #pragma once
 
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) || \
+    defined(__gfx9_4_generic__)
 #define __gfx9__
 #endif
-#if defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__)
 #define __gfx94__
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index 905b32dd15..cfec2237f9 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -175,6 +175,9 @@ struct sequence
         return sequence<type::get(number<Ids>{})...>{};
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto sum() { return (Is + ... + 0); }
+    CK_TILE_HOST_DEVICE static constexpr auto product() { return (Is * ... * 1); }
+
     // modify element at index "I" with value "X"
     template <index_t I, index_t X>
     CK_TILE_HOST_DEVICE static constexpr auto modify(number<I>, number<X>)
diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp
index 245fb7244f..e709fed23d 100644
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -117,12 +117,8 @@ using bf16_raw_t = uint16_t;
 CK_TILE_HOST_DEVICE
 constexpr uint16_t float_to_bf16_rtn_raw(float f)
 {
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {f};
-    if(~u.int32 & 0x7f800000)
+    uint32_t bits = bit_cast<uint32_t>(f);
+    if(~bits & 0x7f800000)
     {
         // When the exponent bits are not all 1s, then the value is zero, normal,
         // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
@@ -140,9 +136,9 @@ constexpr uint16_t float_to_bf16_rtn_raw(float f)
         // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
         // incrementing it causes it to become an exponent of 0xFF and a mantissa
         // of 0x00, which is Inf, the next higher value to the unrounded value.
-        u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
+        bits += 0x7fff + ((bits >> 16) & 1); // Round to nearest, round to even
     }
-    else if(u.int32 & 0xffff)
+    else if(bits & 0xffff)
     {
         // When all of the exponent bits are 1, the value is Inf or NaN.
         // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
@@ -152,9 +148,9 @@ constexpr uint16_t float_to_bf16_rtn_raw(float f)
         // lower 16 bits of the mantissa are 1, we set the least significant bit
         // of the bfloat16 mantissa, in order to preserve signaling NaN in case
         // the bloat16's mantissa bits are all 0.
-        u.int32 |= 0x10000; // Preserve signaling NaN
+        bits |= 0x10000; // Preserve signaling NaN
     }
-    return uint16_t(u.int32 >> 16);
+    return uint16_t(bits >> 16);
 }
 
 CK_TILE_HOST
@@ -225,24 +221,16 @@ uint16_t float_to_bf16_rta_asm(float f)
 CK_TILE_HOST_DEVICE
 constexpr uint16_t float_to_bf16_truc_nan_raw(float f)
 {
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {f};
-    return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+    uint32_t bits = bit_cast<uint32_t>(f);
+    return static_cast<uint16_t>(bits >> 16) | (!(~bits & 0x7f800000) && (bits & 0xffff));
 }
 
 // Fast truncate instead of rounding, RTZ
 CK_TILE_HOST_DEVICE
 constexpr uint16_t float_to_bf16_truc_raw(float f)
 {
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {f};
-    return uint16_t(u.int32 >> 16);
+    uint32_t bits = bit_cast<uint32_t>(f);
+    return static_cast<uint16_t>(bits >> 16);
 }
 
 template <bf16_rounding_mode rounding>
@@ -287,7 +275,7 @@ template <bf16_rounding_mode rounding =
               static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
 CK_TILE_HOST_DEVICE constexpr bfloat16_t float_to_bf16(float f, constant<rounding> = {})
 {
-#if defined(__gfx950__)
+#if CK_TILE_USE_LLVM_BUILTIN_BF16
     return static_cast<bfloat16_t>(f);
 #else
     return bit_cast<bfloat16_t>(float_to_bf16_raw(f, constant<rounding>{}));
diff --git a/include/ck_tile/core/numeric/float8.hpp b/include/ck_tile/core/numeric/float8.hpp
index 04ca950641..890e507894 100644
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -399,9 +399,9 @@ CK_TILE_HOST_DEVICE DstT run_cast_to_f8(SrcT src, unsigned int rng = 0)
         }
         mantissa += (1u << SrcT_mant); // Add the implicit 1 into mantissa
     }
-    // The value is smaller than min f8 denormal and results in zero (the early exit also prevents
+    // The value is <= than min f8 denormal/2 and results in zero (the early exit also prevents
     // an undefined behavior of bit shifts >= type width).
-    if(exponent_diff > DstT_mant)
+    if(exponent_diff > DstT_mant + 1)
     {
         return is_fnuz ? 0 : (sign << (DstT_exp + DstT_mant));
     }
diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp
index f25b98f5a0..8b78990d08 100644
--- a/include/ck_tile/core/numeric/pk_fp4.hpp
+++ b/include/ck_tile/core/numeric/pk_fp4.hpp
@@ -23,7 +23,8 @@ using fp32x2_t = float __attribute__((ext_vector_type(2)));
 using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
 using bf16x2_t = bfloat16_t __attribute__((ext_vector_type(2)));
 
-CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float x, float scale = 1.f);
+struct pk_float4_e2m1_t;
+CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t float_to_pk_fp4(const float& x, float scale = 1.f);
 
 // TODO: Add stochastic method
 struct pk_float4_e2m1_t
@@ -31,7 +32,7 @@ struct pk_float4_e2m1_t
     // TODO: Can we merge raw_type and type?
     using raw_type = uint8_t;
     using type     = raw_type;
-    raw_type data;
+    type data;
 
     CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t() : data{type{}} {}
     template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
@@ -39,12 +40,12 @@ struct pk_float4_e2m1_t
     {
     }
     CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init, float scale = 1.f)
-        : data{float_to_e2m1(init, scale)}
+        : data{float_to_pk_fp4(init, scale)}
     {
     }
     CK_TILE_HOST_DEVICE constexpr operator type() const { return data; }
-    CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; }
-    CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; }
+    CK_TILE_HOST_DEVICE constexpr type& get() { return data; }
+    CK_TILE_HOST_DEVICE constexpr type get() const { return data; }
 
     CK_TILE_HOST_DEVICE constexpr float to_float(float scale = 1.f) const;
     CK_TILE_HOST_DEVICE constexpr fp32x2_t to_fp32x2(float scale = 1.f) const;
@@ -61,8 +62,19 @@ struct pk_float4_e2m1_t
     CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const { return to_bf16x2(); }
 
     template <index_t I>
-    CK_TILE_HOST_DEVICE constexpr raw_type unpack(number<I>) const;
-    CK_TILE_HOST_DEVICE constexpr static pk_float4_e2m1_t pack(const type x0, const type x1)
+    CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t unpack(number<I>) const
+    {
+        return _unpack(number<I>{});
+    }
+    CK_TILE_HOST_DEVICE constexpr static pk_float4_e2m1_t pack(const pk_float4_e2m1_t& x0,
+                                                               const pk_float4_e2m1_t& x1)
+    {
+        return _pack(x0.get(), x1.get());
+    }
+
+    template <index_t I>
+    CK_TILE_HOST_DEVICE constexpr type _unpack(number<I>) const;
+    CK_TILE_HOST_DEVICE constexpr static type _pack(const type x0, const type x1)
     {
         return (x1 << 4) | (x0 & 0b00001111);
     }
@@ -92,7 +104,7 @@ struct pk_float4_e2m1_t
 };
 
 using pk_fp4_t     = pk_float4_e2m1_t;
-using pk_fp4_raw_t = typename pk_fp4_t::raw_type;
+using pk_fp4_raw_t = typename pk_fp4_t::type;
 
 template <>
 struct numeric_traits<pk_fp4_t>
@@ -124,7 +136,7 @@ struct numeric<pk_fp4_t>
     CK_TILE_HOST_DEVICE static constexpr pk_fp4_t epsilon() { return binary_min_subnorm; }
     CK_TILE_HOST_DEVICE static constexpr pk_fp4_t round_error() { return binary_min_subnorm; }
     CK_TILE_HOST_DEVICE static constexpr pk_fp4_t zero() { return binary_zero; }
-    CK_TILE_HOST_DEVICE static constexpr fp8_t denorm_min() { return binary_min_subnorm; }
+    CK_TILE_HOST_DEVICE static constexpr pk_fp4_t denorm_min() { return binary_min_subnorm; }
 
     CK_TILE_HOST_DEVICE static constexpr bool has_inf() { return false; }
     // N/A
@@ -136,7 +148,7 @@ struct numeric<pk_fp4_t>
 };
 
 template <index_t I>
-CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t pk_fp4_t::unpack(number<I>) const
+CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t pk_fp4_t::_unpack(number<I>) const
 {
     static_assert(I < 2, "Index is out of range.");
     if constexpr(I == 1)
@@ -202,7 +214,7 @@ CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_t::to_bf16(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<bf16_t>(data, scale);
 #else
-    return bf16_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale))};
+    return bf16_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale))};
 #endif
 }
 
@@ -211,13 +223,13 @@ CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_t::to_bf16x2(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<bf16x2_t>(data, scale);
 #else
-    return bf16x2_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale)),
-                    type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale))};
+    return bf16x2_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale)),
+                    type_convert<bf16_t>(convert_to_float<pk_fp4_t>(_unpack(number<1>{}), scale))};
 #endif
 }
 
-// TODO: make float_to_e2m1 generic so that we can convert from directrly.
-CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x, float scale)
+// TODO: make it generic so that we can convert from directrly.
+CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_mxfp4(float x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
@@ -227,14 +239,20 @@ CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x, float scale)
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x, float scale)
 {
-    return float_to_e2m1(x, scale);
+#if CK_TILE_FP4_CVT_DEVICE
+    return impl::_to_f4(x, scale);
+#else
+    auto res = convert_to_type<pk_fp4_t>(x, scale);
+    return pk_fp4_t::_pack(res, res);
+#endif
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
 #else
-    return float_to_e2m1(type_convert<float>(x), scale);
+    auto res = float_to_mxfp4(type_convert<float>(x), scale);
+    return pk_fp4_t::_pack(res, res);
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x, float scale)
@@ -242,7 +260,8 @@ CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x, float sca
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
 #else
-    return float_to_e2m1(type_convert<float>(x), scale);
+    auto res = float_to_mxfp4(type_convert<float>(x), scale);
+    return pk_fp4_t::_pack(res, res);
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x, float scale)
@@ -250,7 +269,7 @@ CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x, float
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale));
+    return pk_fp4_t::_pack(float_to_mxfp4(x[0], scale), float_to_mxfp4(x[1], scale));
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x, float scale)
@@ -258,7 +277,7 @@ CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x, float
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale));
+    return pk_fp4_t::_pack(float_to_mxfp4(x[0], scale), float_to_mxfp4(x[1], scale));
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x, float scale)
@@ -266,7 +285,7 @@ CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x, float
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale));
+    return pk_fp4_t::_pack(float_to_mxfp4(x[0], scale), float_to_mxfp4(x[1], scale));
 #endif
 }
 
@@ -301,7 +320,7 @@ CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<fp32_t>(data, scale);
 #else
-    return convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale);
+    return convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale);
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const
@@ -309,8 +328,8 @@ CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<fp32x2_t>(data, scale);
 #else
-    return fp32x2_t{convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale),
-                    convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale)};
+    return fp32x2_t{convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale),
+                    convert_to_float<pk_fp4_t>(_unpack(number<1>{}), scale)};
 #endif
 }
 
@@ -319,7 +338,7 @@ CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<fp16_t>(data, scale);
 #else
-    return fp16_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale))};
+    return fp16_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale))};
 #endif
 }
 CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const
@@ -327,28 +346,29 @@ CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const
 #if CK_TILE_FP4_CVT_DEVICE
     return impl::_from_f4<fp16x2_t>(data, scale);
 #else
-    return fp16x2_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale)),
-                    type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale))};
+    return fp16x2_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(_unpack(number<0>{}), scale)),
+                    type_convert<fp16_t>(convert_to_float<pk_fp4_t>(_unpack(number<1>{}), scale))};
 #endif
 }
 #else
 CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const
 {
-    return e2m1_to_fp32_table[unpack(number<0>{})] * scale;
+    return e2m1_to_fp32_table[_unpack(number<0>{})] * scale;
 }
 CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const
 {
-    return fp32x2_t{e2m1_to_fp32_table[unpack(number<0>{})] * scale, e2m1_to_fp32_table[unpack(number<1>{}] * scale};
+    return fp32x2_t{e2m1_to_fp32_table[_unpack(number<0>{})] * scale, e2m1_to_fp32_table[_unpack(number<1>{}] * scale};
 }
 CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const
 {
-    return type_convert<float>(e2m1_to_fp16_table[unpack(number<0>{})]) * scale;
+    return type_convert<float>(e2m1_to_fp16_table[_unpack(number<0>{})]) * scale;
 }
 CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const
 {
     return fp16x2_t{
-        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[unpack(number<0>{})]) * scale),
-        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[unpack(number<1>{})]) * scale)};
+        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[_unpack(number<0>{})]) * scale),
+        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[_unpack(number<1>{})]) *
+                             scale)};
 }
 #endif
 
diff --git a/include/ck_tile/core/numeric/pk_int4.hpp b/include/ck_tile/core/numeric/pk_int4.hpp
index ad7956d32a..fc1caf13ff 100644
--- a/include/ck_tile/core/numeric/pk_int4.hpp
+++ b/include/ck_tile/core/numeric/pk_int4.hpp
@@ -125,7 +125,7 @@ CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t_signed_conversion(const pk_in
     float x_h = ((x_u8 & 0xf0) >> 4);
 
     x_l = x_l > 7 ? x_l - 16 : x_l;
-    x_h = x_l > 7 ? x_l - 16 : x_l;
+    x_h = x_h > 7 ? x_h - 16 : x_h;
 
 #ifdef CK_TILE_USE_PK4_LAYOUT_SHUFFLE
     fp32x2_t res = {x_h, x_l};
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index d1e770ef42..3b747dae84 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -875,10 +875,9 @@ struct buffer_view<address_space_enum::lds,
         if(is_valid_element)
         {
 #if defined(__gfx950__)
-            constexpr index_t t_per_x               = scalar_per_x_vector / scalar_per_t_vector;
-            constexpr address_space_enum addr_space = get_address_space();
-            return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x, addr_space>(
-                p_data_ + i + linear_offset);
+            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+            return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x>(p_data_ + i +
+                                                                          linear_offset);
 #else
             return X{numeric<remove_cvref_t<T>>::zero()};
 #endif
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index 8b7541bf23..2e9ab0f5c6 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -26,6 +26,29 @@ CK_TILE_DEVICE auto load_tile(const TileWindow_& tile_window,
     return tile_window.load(number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
+/**
+ * @brief Load tile with elementwise function
+ *
+ * @note This function is a modification of the existing load function.
+ *       It has been extended with two additional parameters: it takes a tuple as input
+ *       and an elementwise function. For each A = A0, A1… AN, the elementwise function
+ *       is additionally applied during a single read.
+ */
+template <typename TileWindow_,
+          typename ElementWise_,
+          index_t i_access           = -1,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile_with_elementwise(const TileWindow_& tile_window,
+                                               ElementWise_ elementwise,
+                                               number<i_access>                     = {},
+                                               bool_constant<oob_conditional_check> = {})
+{
+    // TODO: Tile windows should works with unknow number of params
+    // Load element_wise API works only when the input typle is a tuple-tyupe
+    return tile_window[number<0>{}].load(
+        tile_window, elementwise, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
+
 template <typename DistributedTensor_,
           typename TileWindow_,
           index_t i_access           = -1,
diff --git a/include/ck_tile/core/tensor/tensor_descriptor.hpp b/include/ck_tile/core/tensor/tensor_descriptor.hpp
index 3b372d45dd..8ee87ff9d0 100644
--- a/include/ck_tile/core/tensor/tensor_descriptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp
@@ -162,9 +162,15 @@ CK_TILE_HOST_DEVICE static void print(const tensor_descriptor<Transforms,
 {
     printf("tensor_descriptor{\n");
     // first print the tensor adaptor part of the descriptor using the base class print
-    print(static_cast<const typename decltype(descriptor)::Base&>(descriptor));
-    printf("element_space_size_: %ld,\n",
-           static_cast<long>(descriptor.get_element_space_size().value));
+    using Base = typename tensor_descriptor<Transforms,
+                                            LowerDimensionHiddenIdss,
+                                            UpperDimensionHiddenIdss,
+                                            TopDimensionHiddenIds,
+                                            ElementSpaceSize,
+                                            GuaranteedVectorLengths,
+                                            GuaranteedVectorStrides>::Base;
+    print(static_cast<const Base&>(descriptor));
+    printf("element_space_size_: %ld,\n", static_cast<long>(descriptor.get_element_space_size()));
     printf("guaranteed_vector_lengths: ");
     print(GuaranteedVectorLengths{});
     printf(",\nguaranteed_vector_strides: ");
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 6fa8f898e5..fb209ba827 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -455,7 +455,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* __restrict__ p,
     auto buffer_view =
         make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());
 
-    return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
+    return tensor_view<decltype(buffer_view), decltype(desc), DstInMemOp>{buffer_view, desc};
 }
 
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
diff --git a/include/ck_tile/core/tensor/tile_elementwise.hpp b/include/ck_tile/core/tensor/tile_elementwise.hpp
index 284efd5d70..d29afa2d98 100644
--- a/include/ck_tile/core/tensor/tile_elementwise.hpp
+++ b/include/ck_tile/core/tensor/tile_elementwise.hpp
@@ -231,7 +231,7 @@ CK_TILE_DEVICE auto cast_tile_pk_fp8_fp32(const InTensor& in_dstr_tensors)
 template <typename OutDataType, typename InTensor>
 CK_TILE_DEVICE auto cast_tile_pk_fp16_fp32(const InTensor& in_dstr_tensors)
 {
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
     // This API is designed to use the _pk_ serious of function
     constexpr auto in_tile_dstr = InTensor::get_tile_distribution();
 
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index f5ddcd278c..2db5d719c0 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -120,6 +120,116 @@ struct tile_window_with_static_distribution
         return dst_tensor;
     }
 
+    /**
+     * @brief Load tile with elementwise function
+     *
+     * @note Load tile with elementwise — during value loading, an
+     *       elementwise function is executed for each A0, A1, … AN.
+     *       The values A0, A1, … AN are read by the same thread. In this way, we
+     *       reduce the amount of information loaded into the registers.
+     *       The same thread, during vectorized reading, accesses the same set of
+     *       data from A0, A1, A2, … AN.
+     */
+    template <typename TileWindow_,
+              typename ElementWise_,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true>
+    CK_TILE_DEVICE auto load(const TileWindow_& tile_window,
+                             ElementWise_ elementwise,
+                             number<i_access_unsupport_>          = {},
+                             bool_constant<oob_conditional_check> = {}) const
+    {
+        constexpr auto tile_dstr = typename Base::TileDstr{};
+        auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
+        load(dst_tensor,
+             tile_window,
+             elementwise,
+             number<i_access_unsupport_>{},
+             bool_constant<oob_conditional_check>{});
+        return dst_tensor;
+    }
+
+    template <typename DistributedTensor,
+              typename TileWindow_,
+              typename ElementWise_,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true>
+    CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+                             const TileWindow_& tile_window,
+                             ElementWise_ elementwise,
+                             number<i_access_unsupport_>          = {},
+                             bool_constant<oob_conditional_check> = {}) const
+    {
+
+        using Traits   = typename Base::Traits;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
+
+        constexpr auto tile_dstr   = typename Base::TileDstr{};
+        constexpr auto sizeOfTuple = TileWindow_::size();
+        //  loop over thread tensor space [y0, y1, ...]
+        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
+            auto window_adaptor_thread_coord =
+                tile_window[number<0>{}].pre_computed_coords_[iCoord][I0];
+            auto bottom_tensor_thread_coord =
+                tile_window[number<0>{}].pre_computed_coords_[iCoord][I1];
+
+            static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
+                constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
+
+                // data index [y0, y1, ...]
+                constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
+
+                // read from bottom tensor
+                const auto idx_vec_value = generate_tuple(
+                    [&](auto jj) {
+                        return tile_window[number<jj>{}]
+                            .get_bottom_tensor_view()
+                            .template get_vectorized_elements<vector_t>(
+                                bottom_tensor_thread_coord,
+                                0,
+                                bool_constant<oob_conditional_check>{});
+                    },
+                    number<sizeOfTuple>{});
+
+                // write into distributed tensor
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
+                    constexpr auto idx_ys = generate_tuple(
+                        [&](auto jj) {
+                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
+                                                            : idx_ys_start[jj];
+                        },
+                        number<Base::NDimY>{});
+
+                    constexpr index_t d =
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
+
+                    ck_tile::apply(
+                        [&](auto&&... t) {
+                            elementwise(dst_tensor.get_thread_buffer().template at<d>(),
+                                        t.template get_as<
+                                            typename Base::DataType>()[j / Traits::PackedSize]...);
+                        },
+                        idx_vec_value);
+                });
+                // move thread coordinate
+                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
+                {
+                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
+
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimP>{}),
+                        idx_diff_ys);
+
+                    Base::move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+                }
+            });
+        });
+    }
+
     template <typename DistributedTensor,
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true>
@@ -292,7 +402,7 @@ struct tile_window_with_static_distribution
         const index_t m0_init_value =
             size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<false>{});
         m0_set_with_memory(
-            __builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent
+            amd_wave_read_first_lane(m0_init_value)); // This should be wave independent
 
         using Traits = typename Base::Traits;
 
@@ -857,6 +967,39 @@ CK_TILE_DEVICE void move_tile_window(
     window.move(step);
 }
 
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          index_t NumCoord>
+CK_TILE_DEVICE void move_tile_window(
+    tuple<tile_window_with_static_distribution<TensorView_,
+                                               WindowLengths_,
+                                               StaticTileDistribution_,
+                                               NumCoord>>& window,
+    const typename tile_window_with_static_distribution<TensorView_,
+                                                        WindowLengths_,
+                                                        StaticTileDistribution_,
+                                                        NumCoord>::BottomTensorIndex& step)
+{
+    using T = tuple<tile_window_with_static_distribution<TensorView_,
+                                                         WindowLengths_,
+                                                         StaticTileDistribution_,
+                                                         NumCoord>>;
+
+    static constexpr auto N = T::size();
+    static_for<0, N, 1>{}([&](auto Is) { window[number<Is>{}].move(step); });
+}
+
+template <typename TileWindowWithStaticDistributionType,
+          typename StepType,
+          typename std::enable_if_t<
+              is_detected<is_tuple, TileWindowWithStaticDistributionType>::value>* = nullptr>
+CK_TILE_DEVICE void move_tile_window(TileWindowWithStaticDistributionType& window, StepType& step)
+{
+    static constexpr auto N = TileWindowWithStaticDistributionType::size();
+    static_for<0, N, 1>{}([&](auto Is) { window[number<Is>{}].move(step); });
+}
+
 /**
  * @brief This class provides description of tile windowed view on the device memory.
  *
@@ -887,6 +1030,58 @@ struct tile_window_with_static_lengths
         this->window_lengths_     = window_lengths;
         this->bottom_tensor_view_ = bottom_tensor_view;
     }
+
+    /**
+     * @brief Print tile window elements for debugging.
+     *
+     * @tparam DataType Element data type (e.g., fp16_t, float, bf8_t)
+     * @param start_i Starting row (inclusive)
+     * @param end_i   Ending row (exclusive)
+     * @param start_j Starting column (inclusive)
+     * @param end_j   Ending column (exclusive)
+     * @param label   Optional output label
+     *
+     * @note Tested on fp16. Custom types may need adjustments.
+     * @example tile_window.template print_tile_window_range<fp16_t>(0, 4, 0, 8, "A");
+     */
+    template <typename DataType>
+    CK_TILE_DEVICE void print_tile_window_range(index_t start_i,
+                                                index_t end_i,
+                                                index_t start_j,
+                                                index_t end_j,
+                                                const char* label = "") const
+    {
+        const auto& tensor_view  = this->get_bottom_tensor_view();
+        const auto window_origin = this->get_window_origin();
+
+        printf("%s Window Range [%d:%d, %d:%d] (origin: %d, %d):\n",
+               label,
+               start_i,
+               end_i - 1,
+               start_j,
+               end_j - 1,
+               window_origin[0],
+               window_origin[1]);
+
+        for(index_t i = start_i; i < end_i; i++)
+        {
+            for(index_t j = start_j; j < end_j; j++)
+            {
+                // Create coordinate for this element relative to window origin
+                auto coord =
+                    make_tensor_coordinate(tensor_view.get_tensor_descriptor(),
+                                           make_tuple(window_origin[0] + i, window_origin[1] + j));
+
+                // Get the element using thread buffer type directly
+                using ThreadBuf = thread_buffer<DataType, 2>;
+                auto buf        = tensor_view.template get_vectorized_elements<ThreadBuf>(coord, 0);
+                auto value      = buf.at(number<0>{}); // Extract first element from thread buffer
+                printf("  %s[%d,%d] = %f", label, i, j, static_cast<float>(value));
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
 };
 
 template <typename TensorView_, typename WindowLengths_>
diff --git a/include/ck_tile/core/utility/debug.hpp b/include/ck_tile/core/utility/debug.hpp
index 15f0718dc2..9f0f931bc8 100644
--- a/include/ck_tile/core/utility/debug.hpp
+++ b/include/ck_tile/core/utility/debug.hpp
@@ -153,4 +153,28 @@ struct CK_PRINTF_WARP0 : public CK_PRINTF<ConvertTo, FMT, PREFIX, SUFFIX>
             base_t::operator()(buf);
     }
 };
+
+/*
+ * RAII struct which inserts start/end markers into the generated assembly.
+ *
+ * Usage:
+ *   - Create an `AsmScopeMarker` object at the beginning of a scope or code block.
+ *   - Its constructor will emit a "CK_ASM_SCOPE_START" marker into the assembly.
+ *   - When the object goes out of scope (end of block, return, exception, etc.),
+ *     the destructor will emit a "CK_ASM_SCOPE_END" marker.
+ *
+ * Example:
+ *   {
+ *       [[maybe_unused]] AsmScopeMarker marker;   // Emits CK_ASM_SCOPE_START
+ *       // ... code you want to delimit in assembly ...
+ *   } // marker goes out of scope → Emits CK_ASM_SCOPE_END
+ *
+ */
+struct AsmScopeMarker
+{
+    // in some future version of clang we might be able to use string_view to customize
+    CK_TILE_HOST_DEVICE AsmScopeMarker() { asm volatile(";;# CK_ASM_SCOPE_START"); }
+    CK_TILE_HOST_DEVICE ~AsmScopeMarker() { asm volatile(";;# CK_ASM_SCOPE_END"); }
+};
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/utility/philox_rand.hpp b/include/ck_tile/core/utility/philox_rand.hpp
index 87abf5cc18..52b1489543 100644
--- a/include/ck_tile/core/utility/philox_rand.hpp
+++ b/include/ck_tile/core/utility/philox_rand.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -55,7 +55,8 @@ class philox
 
     CK_TILE_HOST_DEVICE void get_random_8x8(uint8_t* out,
                                             const unsigned long long subsequence,
-                                            const index_t start_idx) const
+                                            const index_t idx0,
+                                            const index_t idx1) const
     {
         uint4 tmp_ph;
         tmp_ph = get_philox_4x32(subsequence);
@@ -66,13 +67,12 @@ class philox
         tmp[2]            = tmp_ph.z;
         tmp[3]            = tmp_ph.w;
         uint32_t* out_tmp = reinterpret_cast<uint32_t*>(&out[0]);
-        out_tmp[0]        = tmp[start_idx];
-        out_tmp[1]        = tmp[start_idx + 2];
+        out_tmp[0]        = tmp[idx0];
+        out_tmp[1]        = tmp[idx1];
     }
 
-    CK_TILE_HOST_DEVICE void get_random_4x8(uint8_t* out,
-                                            const unsigned long long subsequence,
-                                            const index_t start_idx) const
+    CK_TILE_HOST_DEVICE void
+    get_random_4x8(uint8_t* out, const unsigned long long subsequence, const index_t idx) const
     {
         uint4 tmp_ph;
         tmp_ph = get_philox_4x32(subsequence);
@@ -83,7 +83,7 @@ class philox
         tmp[2]            = tmp_ph.z;
         tmp[3]            = tmp_ph.w;
         uint32_t* out_tmp = reinterpret_cast<uint32_t*>(&out[0]);
-        out_tmp[0]        = tmp[start_idx];
+        out_tmp[0]        = tmp[idx];
     }
 
     private:
diff --git a/include/ck_tile/core/utility/random.hpp b/include/ck_tile/core/utility/random.hpp
index f7fbfad4dd..6a38ad3bde 100644
--- a/include/ck_tile/core/utility/random.hpp
+++ b/include/ck_tile/core/utility/random.hpp
@@ -24,7 +24,7 @@ struct prand_generator_t<float, seed_>
 {
     CK_TILE_HOST_DEVICE uint32_t operator()(int id, float val, uint32_t seed = seed_)
     {
-        uint32_t x         = *(reinterpret_cast<uint32_t*>(&val));
+        uint32_t x         = bit_cast<uint32_t>(val);
         uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
         drop_bits ^= x >> 16;
         drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
@@ -43,7 +43,7 @@ struct prand_generator_t<half_t, seed_>
 {
     CK_TILE_HOST_DEVICE uint32_t operator()(int id, half_t val, uint32_t seed = seed_)
     {
-        uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
+        uint16_t x         = bit_cast<uint16_t>(val);
         uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
         drop_bits          = ((drop_bits & 31) << 11) | (drop_bits >> 5);
         drop_bits *= 0x7000149;
diff --git a/include/ck_tile/core/utility/transpose_vectors.hpp b/include/ck_tile/core/utility/transpose_vectors.hpp
index 497fd3b948..f0d7dae706 100644
--- a/include/ck_tile/core/utility/transpose_vectors.hpp
+++ b/include/ck_tile/core/utility/transpose_vectors.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -34,7 +34,13 @@ struct transpose_vectors
         constexpr auto I3 = number<3>{};
         constexpr auto I4 = number<4>{};
 
-        if constexpr(sizeof(S) == 2)
+        if constexpr(sizeof(S) == 4)
+        {
+            static_for<0, NY, 1>{}([&](auto iy) {
+                static_for<0, NX, 1>{}([&](auto ix) { vy_tuple(iy)(ix) = vx_tuple[ix][iy]; });
+            });
+        }
+        else if constexpr(sizeof(S) == 2)
         {
             static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
 
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index 41f5200413..d815b1db40 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -16,8 +16,10 @@
 #include "ck_tile/host/host_tensor.hpp"
 #include "ck_tile/host/joinable_thread.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/host/permute_pk_int4.hpp"
 #include "ck_tile/host/ranges.hpp"
 #include "ck_tile/host/reference/reference_batched_dropout.hpp"
+#include "ck_tile/host/reference/reference_batched_dropout_randval.hpp"
 #include "ck_tile/host/reference/reference_batched_elementwise.hpp"
 #include "ck_tile/host/reference/reference_batched_gemm.hpp"
 #include "ck_tile/host/reference/reference_batched_masking.hpp"
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index e03881a1c7..817a46a8ea 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -67,7 +67,10 @@ struct FillUniformDistribution
                                                        : std::random_device{}());
                     std::uniform_real_distribution<float> dis(a_, b_);
                     std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
-                        return ck_tile::type_convert<T>(dis(gen));
+                        if constexpr(numeric_traits<T>::PackedSize == 2)
+                            return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
+                        else
+                            return ck_tile::type_convert<T>(dis(gen));
                     });
                 };
                 threads[it] = joinable_thread(thread_f);
@@ -77,8 +80,12 @@ struct FillUniformDistribution
         {
             std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
             std::uniform_real_distribution<float> dis(a_, b_);
-            std::generate(
-                first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
+            std::generate(first, last, [&dis, &gen]() {
+                if constexpr(numeric_traits<T>::PackedSize == 2)
+                    return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
+                else
+                    return ck_tile::type_convert<T>(dis(gen));
+            });
         }
     }
 
diff --git a/include/ck_tile/host/permute_pk_int4.hpp b/include/ck_tile/host/permute_pk_int4.hpp
new file mode 100644
index 0000000000..b770edddca
--- /dev/null
+++ b/include/ck_tile/host/permute_pk_int4.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c), Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck_tile/core/utility/bit_cast.hpp"
+namespace ck_tile {
+
+/**
+ * @brief Permute packed int4 vectors for device implementation compatibility
+ *
+ * This function transforms 4 pk_int4_t values from original layout to hardware-optimized layout:
+ * - Original layout (4 pk_int4_t): 0x76543210
+ * - Transformed layout (4 pk_int4_t): 0x75316420
+ *
+ * Each pk_int4_t contains two 4-bit values packed in the high and low nibbles of an int8_t
+ *
+ * Example:
+ * - Input:  0x76, 0x54, 0x32, 0x10
+ * - Output: 0x75, 0x31, 0x64, 0x20
+ *
+ * @note Input tensor length must be a multiple of 4
+ *
+ * This transformation is required before transferring B matrix data (of type pk_int4_t) to device.
+ * The device conversion functions (i4_to_half4, i4_to_bhalf4, amd_assembly_i4_to_fp8x8,
+ * amd_assembly_i4_to_bf8x8) require data in 0x75316420 order to correctly convert pk_int4_t to
+ * other numeric types.
+ */
+template <typename Tensor>
+void permute_vectors_i4x4_b(Tensor& tensor)
+{
+    auto tensor_row_buf = tensor.data();
+    for(size_t idx = 0; idx < tensor.size(); idx += 4)
+    {
+        int8_t input[8];
+
+        for(int k = 0; k < 4; k++)
+        {
+            int8_t i4x2      = bit_cast<int8_t>(tensor_row_buf[idx + k]);
+            input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+            input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+        }
+
+        // permute 0x76543210 => 0x75316420
+        {
+            int8_t hi   = input[2];
+            int8_t lo   = input[0];
+            int8_t i4x2 = (hi << 4) | lo;
+
+            tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
+        }
+
+        {
+            int8_t hi   = input[6];
+            int8_t lo   = input[4];
+            int8_t i4x2 = (hi << 4) | lo;
+
+            tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
+        }
+
+        {
+            int8_t hi   = input[3];
+            int8_t lo   = input[1];
+            int8_t i4x2 = (hi << 4) | lo;
+
+            tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
+        }
+
+        {
+            int8_t hi   = input[7];
+            int8_t lo   = input[5];
+            int8_t i4x2 = (hi << 4) | lo;
+
+            tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
+        }
+    }
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_batched_dropout_randval.hpp b/include/ck_tile/host/reference/reference_batched_dropout_randval.hpp
new file mode 100644
index 0000000000..ec6c6009b7
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_batched_dropout_randval.hpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+
+template <typename RandValOutputDataType>
+CK_TILE_HOST void
+reference_batched_dropout_randval(HostTensor<RandValOutputDataType>& randval_b_m_n,
+                                  index_t batch,
+                                  uint64_t drop_seed,
+                                  uint64_t drop_offset)
+{
+    const index_t nhead         = randval_b_m_n.mDesc.get_lengths()[0];
+    const index_t real_seqlen_q = randval_b_m_n.mDesc.get_lengths()[1];
+    const index_t real_seqlen_k = randval_b_m_n.mDesc.get_lengths()[2];
+
+    static_assert(std::is_same_v<RandValOutputDataType, uint8_t>);
+
+    // BlockDropout generates random numbers by 32x32 tiles. Even when warp gemm 16x16 is used, the
+    // order of values in the bigger 32x32 tile must be the same because fwd and bwd may use
+    // different warp gemms (16x16 or 32x32).
+    // To compute 32x32 tiles, WarpGemmMfmaF16F16F32M32N32K16SwizzleA is used. It is
+    // WarpGemmAttributeMfmaImplF16F16F32M32N32K8 with SFactor = 2 (swizzling factor).
+    // Matrix element to register mapping for WarpGemmAttributeMfmaImplF16F16F32M32N32K8:
+    // C i:  (8 * floor(GPR_num / 4) % 32) + 4 * floor(lane / 32) + (GPR_num % 4)
+    // C j: (lane % 32)
+    // With SFactor = 2 it becomes:
+    // C i: (16 * floor(GPR_num / 8) % 32) + 8 * floor(lane / 32) + (GPR_num % 8)
+    // C j: (lane % 32)
+    // See ck_tile/ops/fmha/block/block_dropout.hpp for more details.
+
+    // The number of Philox 4x32 results required to fill 32x32 tile of 8-bit values
+    constexpr index_t philox_per_tile = 64;
+    constexpr index_t warp_gemm_mn    = 32;
+
+    const index_t rows = integer_divide_ceil(real_seqlen_q, warp_gemm_mn);
+    const index_t cols = integer_divide_ceil(real_seqlen_k, warp_gemm_mn);
+
+    auto f = [&](index_t i_h, index_t row, index_t col) {
+        uint2 rowcol = make_uint2(row, col);
+        for(index_t lane = 0; lane < philox_per_tile; lane++)
+        {
+            const uint64_t ph_head_offset = drop_offset + (batch * nhead + i_h) * philox_per_tile;
+            const index_t ph_offset       = lane;
+            philox ph(drop_seed, ph_head_offset + ph_offset);
+
+            uint8_t random_uint8_t[16];
+            ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
+
+            for(auto r = 0; r < 16; r++)
+            {
+                index_t i = (16 * (r / 8) % 32) + 8 * (lane / 32) + (r % 8);
+                index_t j = (lane % 32);
+                index_t m = row * warp_gemm_mn + i;
+                index_t n = col * warp_gemm_mn + j;
+
+                if(m < real_seqlen_q && n < real_seqlen_k)
+                {
+                    randval_b_m_n(i_h, m, n) = random_uint8_t[r];
+                }
+            }
+        }
+    };
+
+    make_ParallelTensorFunctor(f, nhead, rows, cols)(std::thread::hardware_concurrency());
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index 70ca44170e..90f68f7e2e 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -50,7 +50,7 @@ CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
             if constexpr(std::is_same_v<ADataType, pk_int4_t>)
             {
                 const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
-                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                 if(k % 2 == 1)
                     v_a = fp32_val.hi;
                 else
@@ -63,7 +63,7 @@ CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
             if constexpr(std::is_same_v<BDataType, pk_int4_t>)
             {
                 const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
-                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                 if(k % 2 == 1)
                     v_b = fp32_val.hi;
                 else
@@ -115,6 +115,138 @@ CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
     std::cout << std::endl;
 }
 
+template <typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename BQDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename AElementOp   = ck_tile::identity,
+          typename BElementOp   = ck_tile::identity,
+          typename ACCElementOp = ck_tile::identity>
+CK_TILE_HOST void reference_gemm_rowcol_quant(const HostTensor<ADataType>& a_m_k,
+                                              const HostTensor<AQDataType>& aq_m_1,
+                                              const HostTensor<BDataType>& b_k_n,
+                                              const HostTensor<BQDataType>& bq_1_n,
+                                              HostTensor<CDataType>& c_m_n,
+                                              const AElementOp& a_element_op     = {},
+                                              const BElementOp& b_element_op     = {},
+                                              const ACCElementOp& acc_element_op = {})
+{
+    static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
+    static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
+    static_assert(std::is_same_v<AccDataType, float>);
+    static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
+    static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
+    const std::size_t M = a_m_k.get_length(0);
+    const std::size_t N = b_k_n.get_length(1);
+    const std::size_t K = a_m_k.get_length(1);
+
+    auto f_mn = [&](auto m, auto n) {
+        // Init accumulator
+        AccDataType v_acc = 0;
+        // Get row scale for A and column scale for B
+        float a_scale = aq_m_1(m, 0);
+        float b_scale = bq_1_n(0, n);
+
+        // Compute the dot product
+        for(std::size_t k = 0; k < K; ++k)
+        {
+            AccDataType v_a;
+            AccDataType v_b;
+
+            // Process A data
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
+            }
+
+            // Process B data
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
+            }
+
+            v_acc += v_a * v_b;
+        }
+
+        v_acc = v_acc * a_scale * b_scale;
+
+        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
+    };
+
+    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
+}
+
+template <typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename BQDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename AElementOp   = ck_tile::identity,
+          typename BElementOp   = ck_tile::identity,
+          typename ACCElementOp = ck_tile::identity>
+CK_TILE_HOST void reference_gemm_tensor_quant(const HostTensor<ADataType>& a_m_k,
+                                              const HostTensor<AQDataType>& aq_1_1,
+                                              const HostTensor<BDataType>& b_k_n,
+                                              const HostTensor<BQDataType>& bq_1_1,
+                                              HostTensor<CDataType>& c_m_n,
+                                              const AElementOp& a_element_op     = {},
+                                              const BElementOp& b_element_op     = {},
+                                              const ACCElementOp& acc_element_op = {})
+{
+    static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
+    static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
+    static_assert(std::is_same_v<AccDataType, float>);
+    static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
+    static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
+    const std::size_t M = a_m_k.get_length(0);
+    const std::size_t N = b_k_n.get_length(1);
+    const std::size_t K = a_m_k.get_length(1);
+
+    auto f_mn = [&](auto m, auto n) {
+        // Init accumulator
+        AccDataType v_acc = 0;
+        // Get scale for A and scale for B
+        const AccDataType a_scale = ck_tile::type_convert<AccDataType>(aq_1_1(0, 0));
+        const AccDataType b_scale = ck_tile::type_convert<AccDataType>(bq_1_1(0, 0));
+
+        // Compute the dot product
+        for(std::size_t k = 0; k < K; ++k)
+        {
+            AccDataType v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
+            AccDataType v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
+
+            v_acc += v_a * v_b;
+        }
+
+        v_acc = v_acc * a_scale * b_scale;
+
+        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
+    };
+
+    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
+}
+
 template <typename ADataType,
           typename BDataType,
           typename AccDataType,
@@ -175,6 +307,81 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
     make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
 }
 
+template <typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename AElementOp,
+          typename BElementOp,
+          typename CDElementOp,
+          typename ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>,
+          typename BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>,
+          typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
+CK_TILE_HOST void
+reference_gemm_multiple_abd(const std::array<HostTensor<ADataType>, AsDataType::size()>& as_m_k,
+                            const std::array<HostTensor<BDataType>, BsDataType::size()>& bs_k_n,
+                            const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
+                            HostTensor<ADataType>& a_m_k,
+                            HostTensor<BDataType>& b_k_n,
+                            HostTensor<CDataType>& c_m_n,
+                            const AElementOp& a_element_op    = {},
+                            const BElementOp& b_element_op    = {},
+                            const CDElementOp& acc_element_op = {})
+{
+    const std::size_t M = a_m_k.get_length(0);
+    const std::size_t N = b_k_n.get_length(1);
+    const std::size_t K = a_m_k.get_length(1);
+
+    auto as_m_k_tuple =
+        generate_tie([&](auto idx) -> auto& { return as_m_k[idx]; }, number<AsDataType::size()>{});
+
+    auto bs_k_n_tuple =
+        generate_tie([&](auto idx) -> auto& { return bs_k_n[idx]; }, number<BsDataType::size()>{});
+
+    auto ds_m_n_tuple =
+        generate_tie([&](auto idx) -> auto& { return ds_m_n[idx]; }, number<DsDataType::size()>{});
+
+    // Apply elementwise function to A
+    auto a_elementwise_fn = [&](auto i, auto j) {
+        ck_tile::apply([&](auto&&... t) { a_element_op(a_m_k(i, j), t(i, j)...); }, as_m_k_tuple);
+    };
+
+    make_ParallelTensorFunctor(a_elementwise_fn, M, K)(std::thread::hardware_concurrency());
+
+    // Apply elementwise function to B
+    auto b_elementwise_fn = [&](auto i, auto j) {
+        ck_tile::apply([&](auto&&... t) { b_element_op(b_k_n(i, j), t(i, j)...); }, bs_k_n_tuple);
+    };
+
+    make_ParallelTensorFunctor(b_elementwise_fn, K, N)(std::thread::hardware_concurrency());
+
+    auto f_mk_kn_mn = [&](auto m, auto n) {
+        AccDataType v_acc = 0;
+        for(std::size_t k = 0; k < K; ++k)
+        {
+            ADataType v_a = a_m_k(m, k);
+            BDataType v_b = b_k_n(k, n);
+            v_acc +=
+                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
+        }
+
+        CDataType v_c = 0;
+
+        ck_tile::apply(
+            [&](auto&&... t) {
+                acc_element_op(v_c,
+                               ck_tile::type_convert<float>(v_acc),
+                               ck_tile::type_convert<float>(t(m, n))...);
+            },
+            ds_m_n_tuple);
+
+        c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
+    };
+
+    make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
+}
+
 template <typename ADataType,
           typename BDataType,
           typename DsDataType,
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
index 1768c802d5..6c0972e10a 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -9,5 +9,7 @@
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
index c7717f08cd..b6eac45285 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
@@ -95,7 +95,11 @@ struct AddRmsnorm2dRdquantFwd
         return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? Problem::BlockShape::template GetBlockSize<true>()
+                           : Problem::BlockShape::template GetBlockSize<false>();
+    }
 
     // clang-format off
     template <typename T> struct t2s;
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
index ecd4e81b22..052ee4ae62 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
@@ -92,13 +92,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
 
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(a_window)));
         auto square_sum   = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto a = load_tile(a_window);
             const auto b = load_tile(b_window);
@@ -149,7 +149,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         if constexpr(kSaveX)
             __syncthreads();
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x = [&]() {
                 if constexpr(kSaveX)
@@ -226,7 +226,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         }
         move_tile_window(gamma_window, {Block_N});
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x = [&]() {
                 if constexpr(kSaveX)
diff --git a/include/ck_tile/ops/batched_transpose.hpp b/include/ck_tile/ops/batched_transpose.hpp
index ca0088c812..5822d7b91b 100644
--- a/include/ck_tile/ops/batched_transpose.hpp
+++ b/include/ck_tile/ops/batched_transpose.hpp
@@ -12,5 +12,7 @@
 #include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
 #include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
index b0f48f6c5b..c99571562d 100644
--- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
+++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
@@ -84,9 +84,9 @@ struct BatchedTransposeKernel
         static constexpr ck_tile::index_t VectorSizeOutput   = Problem::VectorSizeOutput;
         static constexpr ck_tile::index_t VectorStrideOutput = 1;
 
-        const auto iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
-        const auto iN     = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);
-        const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width);
+        const auto iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
+        const auto iN     = amd_wave_read_first_lane(blockIdx.y * kNPerBlock);
+        const auto offset = amd_wave_read_first_lane(blockIdx.z * kargs.height * kargs.width);
 
         const auto x_m_n = [&]() {
             const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
index 3b8d5a142e..9e2a67f940 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
@@ -21,12 +21,12 @@ struct BatchedTransposeCommonPolicy
 
         constexpr index_t kVectorSize = Problem::VectorSizeInput;
         static_assert((kLeadDimPerBlock * kVectorSize) % kBlockSize == 0, "");
-        using TileEncodingPattern = TileDistributionEncodingPattern2D<kBlockSize,
-                                                                      kSecondDimPerBlock,
-                                                                      kLeadDimPerBlock,
-                                                                      kVectorSize,
-                                                                      TileAccessPattern>;
-        return TileEncodingPattern::Make2DStaticTileDistribution();
+        using TileEncodingPattern = tile_distribution_encoding_pattern_2d<kBlockSize,
+                                                                          kSecondDimPerBlock,
+                                                                          kLeadDimPerBlock,
+                                                                          kVectorSize,
+                                                                          TileAccessPattern>;
+        return TileEncodingPattern::make_2d_static_tile_distribution();
     }
 };
 
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
index e6bbc709ea..137584c3e8 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
@@ -18,12 +18,12 @@ struct BatchedTransposePolicy : public BatchedTransposeCommonPolicy
         constexpr index_t NPerBlock   = Problem::kNPerBlock;
         constexpr index_t VecLoadSize = Problem::VectorSizeOutput;
 
-        using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                      MPerBlock,
-                                                                      NPerBlock,
-                                                                      VecLoadSize,
-                                                                      TileAccessPattern>;
-        return TileEncodingPattern::MakeShuffled2DStaticTileDistribution();
+        using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                          MPerBlock,
+                                                                          NPerBlock,
+                                                                          VecLoadSize,
+                                                                          TileAccessPattern>;
+        return TileEncodingPattern::make_shuffled_2d_static_tile_distribution();
     }
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/common.hpp b/include/ck_tile/ops/common.hpp
index 027e2fdd94..eff2d625b3 100644
--- a/include/ck_tile/ops/common.hpp
+++ b/include/ck_tile/ops/common.hpp
@@ -4,5 +4,7 @@
 #pragma once
 
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/common/generic_2d_block_shape.hpp b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
index c0bfd93198..9c5d99efc3 100644
--- a/include/ck_tile/ops/common/generic_2d_block_shape.hpp
+++ b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -35,43 +35,86 @@ namespace ck_tile {
         +-----------+-----------+-----------+-----------+-----------+
 // clang-format on
 */
-template <typename BlockTile_,    // block size, seq<M, N>
-          typename WarpPerBlock_, // num warps along seq<M, N>
-          typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_>       // contiguous pixels(vector size) along seq<M, N>)>
+template <typename BlockTile_,      // block size, seq<M, N>
+          typename ThreadPerBlock_, // num threads along seq<M, N>
+          typename Vector_>         // contiguous pixels(vector size) along seq<M, N>)>
 struct Generic2dBlockShape
 {
     // block size
-    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
-
-    // num warps along seq<M, N>, within each block
-    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
-
-    // warp size
-    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
-
-    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
-    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
-    // repeat of each thread along seq<M, N>
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+    static constexpr index_t Block_M          = BlockTile_::at(number<0>{});
+    static constexpr index_t Block_N          = BlockTile_::at(number<1>{});
+    static constexpr index_t ThreadPerBlock_M = ThreadPerBlock_::at(number<0>{});
+    static constexpr index_t ThreadPerBlock_N = ThreadPerBlock_::at(number<1>{});
 
     // vector size along seq<M, N>
     static constexpr index_t Vector_M = Vector_::at(number<0>{});
     static constexpr index_t Vector_N = Vector_::at(number<1>{});
 
+    // num warps along seq<M, N>, within each block
+    template <bool isHostWave32>
+    static constexpr index_t GetWarpPerBlock_M()
+    {
+        constexpr index_t warp_size    = isHostWave32 ? 32 : get_warp_size();
+        constexpr bool is_warp_per_row = ThreadPerBlock_N <= warp_size;
+        static_assert((ThreadPerBlock_M * ThreadPerBlock_N) % warp_size == 0);
+        constexpr index_t total_warps = (ThreadPerBlock_M * ThreadPerBlock_N) / warp_size;
+
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warp_size % ThreadPerBlock_N == 0);
+            return total_warps * (warp_size / ThreadPerBlock_N);
+        }
+        else
+        {
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N / warp_size);
+        }
+    };
+
+    // num of warps along n
+    template <bool isHostWave32>
+    static constexpr index_t GetWarpPerBlock_N()
+    {
+        constexpr index_t warp_size    = isHostWave32 ? 32 : get_warp_size();
+        constexpr bool is_warp_per_row = ThreadPerBlock_N <= warp_size;
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warp_size % ThreadPerBlock_N == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N % warp_size == 0);
+            return ThreadPerBlock_N / warp_size;
+        }
+    }
+
+    static constexpr index_t WarpPerBlock_M = GetWarpPerBlock_M<false>();
+    static constexpr index_t WarpPerBlock_N = GetWarpPerBlock_N<false>();
+
+    // warp size
+    static constexpr index_t BlockSize = WarpPerBlock_M * WarpPerBlock_N * get_warp_size();
+    static constexpr index_t Warp_M    = ThreadPerBlock_M / WarpPerBlock_M * Vector_M;
+    static constexpr index_t Warp_N    = ThreadPerBlock_N / WarpPerBlock_N * Vector_N;
     static_assert(Warp_M % Vector_M == 0);
     static_assert(Warp_N % Vector_N == 0);
-    // num of threads along seq<M, N>, within each warp
-    static constexpr index_t ThreadPerWarp_M  = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N  = Warp_N / Vector_N;
-    static constexpr index_t ThreadPerBlock_M = Block_M / Repeat_M / Vector_M;
-    static constexpr index_t ThreadPerBlock_N = Block_N / Repeat_N / Vector_N;
+    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
+    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
 
-    static constexpr index_t BlockSize = ThreadPerBlock_M * ThreadPerBlock_N;
+    // repeat of each thread along seq<M, N>
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    // num of threads along seq<M, N>, within each warp
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+
+    template <bool isHostWave32>
+    static constexpr index_t GetBlockSize()
+    {
+        constexpr index_t warp_size = isHostWave32 ? 32 : get_warp_size();
+        return GetWarpPerBlock_M<isHostWave32>() * GetWarpPerBlock_N<isHostWave32>() * warp_size;
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/common/load_interleaved_pk_type.hpp b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp
new file mode 100644
index 0000000000..f8432b9da0
--- /dev/null
+++ b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+
+namespace ck_tile {
+
+template <class T>
+struct is_pk_int4 : std::false_type
+{
+};
+template <>
+struct is_pk_int4<pk_int4_t> : std::true_type
+{
+};
+
+template <typename ComputeDataType, index_t UnaryOpSize>
+struct InterleavedPKTypeLoader
+{
+    template <typename WarpWindow, typename WarpTile>
+    CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& warp_tile,
+                                                        const WarpWindow& warp_window)
+    {
+        const element_wise::PassThroughPack8 elementwise_op{};
+
+        static_assert(WarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
+        constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize;
+        const auto in_dstr_tensors           = load_tile(warp_window);
+
+        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
+        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
+            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
+                           in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
+        });
+    }
+};
+
+template <typename BDataType,
+          typename ComputeDataType,
+          index_t UnaryOpSize,
+          typename WarpTile,
+          typename WarpWindow>
+CK_TILE_DEVICE void load_int4_tile(WarpTile& dst, const WarpWindow& src)
+{
+    if constexpr(is_pk_int4<std::remove_cv_t<BDataType>>::value)
+    {
+        InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src);
+    }
+    else
+    {
+        dst = load_tile(src);
+    }
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp
new file mode 100644
index 0000000000..5dbe6223c4
--- /dev/null
+++ b/include/ck_tile/ops/common/streamk_common.hpp
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+enum StreamKReductionStrategy : uint32_t
+{
+    Atomic    = 0u,
+    Reduction = 1u
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp
index 4858245ec4..7f2303932e 100644
--- a/include/ck_tile/ops/elementwise.hpp
+++ b/include/ck_tile/ops/elementwise.hpp
@@ -10,5 +10,7 @@
 #include "ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp"
 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp b/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
index f9b1cf3352..bf56a36b1e 100644
--- a/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
@@ -82,6 +82,14 @@ struct Add
         y                  = type_convert<bf16_t>(y_tmp);
     }
 
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bf16_t>(bf16_t& y, const float& x0, const float& x1) const
+    {
+        const float y_tmp = x0 + x1;
+        y                 = type_convert<bf16_t>(y_tmp);
+    }
+
     template <>
     __host__ __device__ constexpr void
     operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
diff --git a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
index 2ec9414f42..b1e5e01777 100644
--- a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
+++ b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
@@ -21,11 +21,15 @@ struct ElementWiseKernel
     using ElementWiseOperation = ck_tile::remove_cvref_t<typename Problem::ElementWiseOperation>;
 
     static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? kBlockSize / 2 : kBlockSize;
+    }
 
     template <typename... XDataType, typename Dims>
-    CK_TILE_DEVICE void operator()(Dims lens,
-                                   Dims input_strides,
-                                   Dims output_strides,
+    CK_TILE_DEVICE void operator()(const Dims lens,
+                                   const Dims input_strides,
+                                   const Dims output_strides,
                                    const tuple<XDataType...>& input_tensors,
                                    YDataType* p_y) const
     {
@@ -100,24 +104,8 @@ struct ElementWiseKernel
     template <typename... Ints>
     CK_TILE_HOST static bool IsSupportedArgument(const ck_tile::tuple<Ints...>& input_sizes)
     {
-        int total_elements  = 1;
-        const auto kVectorM = Problem_::BlockShape::kVectorM;
-
-        apply([&](auto&&... args) { ((total_elements *= args), ...); }, input_sizes);
-
-        if((total_elements % kVectorM) != 0)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("Conditions not met: total number of input elements (",
-                              total_elements,
-                              ") should be multiple of the vectorization size (",
-                              kVectorM,
-                              ")");
-            }
-            return false;
-        }
-
+        // when total elements % kVectorM != 0; should use Pad instead of unsupported
+        ignore = input_sizes;
         return true;
     }
 };
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 2f8cef7afd..ea8ba4557e 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -4,15 +4,46 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include <cstdint>
 #include <type_traits>
 
+#define CONSTEXPR_LOOKUP_TABLE_FOR_BF16 1
+#define CONSTEXPR_LOOKUP_TABLE_FOR_FP8 0
+#define CONSTEXPR_LOOKUP_TABLE_FOR_BF8 0
+
 namespace ck_tile {
 namespace element_wise {
 
-// Fast int4x4 to fp16x8_t data type conversion based on paper
-// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
-// (https://arxiv.org/abs/2211.10017) and implementation:
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+// Generalized constexpr lookup table generator
+template <typename T, std::size_t N, typename F, std::size_t... Is>
+constexpr std::array<T, N> make_lookup_table_impl(F&& func, std::index_sequence<Is...>)
+{
+    return {func(Is)...};
+}
+
+template <typename T, std::size_t N, typename F>
+constexpr std::array<T, N> make_lookup_table(F&& func)
+{
+    return make_lookup_table_impl<T, N>(std::forward<F>(func), std::make_index_sequence<N>{});
+}
+
+/**
+ * @brief Fast int4x4 to fp16x8_t data type conversion based on paper
+ * "Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production"
+ * @see https://arxiv.org/abs/2211.10017
+ * @see
+ * https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+ *
+ * This function converts 4 4-bit integers into 4 fp16 values.
+ * @note `int q` contains 4 bytes, low 4 bits of each byte represent an int4.
+ * @note This function assumes pk_int4_t has a bias of 8, meaning 0b0000 is converted to fp16(-8)
+ * @note The output ordering differs from input ordering. For example, when input is 0x76543210,
+ *       the output sequence will be fp16(7, 3, 6, 2, 5, 1, 4, 0). Therefore, the input tensor
+ *       must be preprocessed with permute_vectors_i4x4_b on the host side before using this
+ * function.
+ *
+ * @see permute_vectors_i4x4_b
+ */
 CK_TILE_DEVICE fp16x4_t i4_to_half4(int q)
 {
     const int LO = 0x000f000f;
@@ -46,6 +77,18 @@ CK_TILE_DEVICE fp16x4_t i4_to_half4(int q)
     return res;
 }
 
+/**
+ * @brief This function dequantizes 4 int4 values into 4 fp16 values and applies scaling.
+ *
+ * @note `int q` contains 4 bytes, low 4 bits of each byte represent an int4.
+ * @note This function assumes pk_int4_t has a bias of 8, meaning 0b0000 is converted to fp16(-8)
+ * @note The output ordering differs from input ordering. For example, when input is 0x76543210,
+ *       the output sequence will be fp16(7, 3, 6, 2, 5, 1, 4, 0). Therefore, the input tensor
+ *       must be preprocessed with permute_vectors_i4x4_b on the host side before using this
+ * function.
+ *
+ * @see permute_vectors_i4x4_b
+ */
 CK_TILE_DEVICE fp16x4_t i4_to_half4_scale(int q, const fp16x2_t& scale)
 {
     const int LO = 0x000f000f;
@@ -81,8 +124,22 @@ CK_TILE_DEVICE fp16x4_t i4_to_half4_scale(int q, const fp16x2_t& scale)
     return res;
 }
 
+/**
+ * @brief This function converts 4 4-bit integers into 4 bf16 values.
+ *
+ * @note `int q` contains 4 bytes, low 4 bits of each byte represent an int4.
+ * @note This function assumes pk_int4_t has a bias of 8, meaning 0b0000 is converted to bf16(-8)
+ * @note The output ordering differs from input ordering. For example, when input is 0x76543210,
+ *       the output sequence will be bf16(7, 3, 6, 2, 5, 1, 4, 0). Therefore, the input tensor
+ *       must be preprocessed with permute_vectors_i4x4_b on the host side before using this
+ * function.
+ *
+ * @see permute_vectors_i4x4_b
+ */
 CK_TILE_DEVICE bf16x4_t i4_to_bhalf4(int q)
 {
+#if !CONSTEXPR_LOOKUP_TABLE_FOR_BF16
+    // This approach fails validation in GEMM tests.
     uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
 
     static constexpr uint32_t fp32_base = 0x4B000000;
@@ -108,40 +165,95 @@ CK_TILE_DEVICE bf16x4_t i4_to_bhalf4(int q)
         __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632));
 
     return res;
+#else
+    // Lookup table for bf16_t values corresponding to int4 values -8 to 7
+    constexpr auto bf16_lookup_table = make_lookup_table<bf16_t, 16>(
+        [](int i) { return bit_cast<bf16_t>(float_to_bf16_rtn_raw(i - 8)); });
+
+    return bf16x4_t{bf16_lookup_table[(q >> 0) & 0xf],
+                    bf16_lookup_table[(q >> 16) & 0xf],
+                    bf16_lookup_table[(q >> 4) & 0xf],
+                    bf16_lookup_table[(q >> 20) & 0xf]};
+#endif
 }
 
+#if !CONSTEXPR_LOOKUP_TABLE_FOR_FP8
+/**
+ * @brief This function converts 8 packed 4-bit integers into 8 fp8 values.
+ *
+ * @note `int q` contains 4 bytes, each byte represents 2 int4.
+ * @note This function assumes pk_int4_t has a bias of 8, meaning 0b0000 is converted to fp8(-8)
+ * @note The output ordering differs from input ordering. For example, when input is 0x76543210,
+ *       the output sequence will be fp8(7, 3, 6, 2, 5, 1, 4, 0). Therefore, the input tensor
+ *       must be preprocessed with permute_vectors_i4x4_b on the host side before using this
+ * function.
+ *
+ * @see permute_vectors_i4x4_b
+ */
 CK_TILE_DEVICE fp8x8_t amd_assembly_i4_to_fp8x8(int a)
 {
-    uint32_t src = static_cast<uint32_t>(a), src_hi;
-    uint32_t fp8x4_lo, fp8x4_hi;
-    float tmp_0, tmp_1;
+#if CK_TILE_USE_OCP_FP8
+    // register values [3, 2, 1, 0]
+    static constexpr uint32_t reg0 = 0xcaccced0;
+    // register values [7, 6, 5, 4]
+    static constexpr uint32_t reg1 = 0xb8c0c4c8;
+    // register values [-1, -2, -3, -4]
+    static constexpr uint32_t reg2 = 0x44403800;
+    // register values [-5, -6, -7, -8]
+    static constexpr uint32_t reg3 = 0x4e4c4a48;
+#else
+    // register values [3, 2, 1, 0]
+    static constexpr uint32_t reg0 = 0xd2d4d6d8;
+    // register values [7, 6, 5, 4]
+    static constexpr uint32_t reg1 = 0xc0c8ccd0;
+    // register values [-1, -2, -3, -4]
+    static constexpr uint32_t reg2 = 0x4C484000;
+    // register values [-5, -6, -7, -8]
+    static constexpr uint32_t reg3 = 0x56545250;
+#endif
 
-    asm volatile("v_lshrrev_b32 %[v_hi_src], 4, %[v_src]\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_3\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_3\n"
-                 "v_cvt_pk_fp8_f32 %[v_dst_hi], %[v_tmp_1], %[v_tmp_0], op_sel:[0, 0, 1]\n"
+    uint32_t tmp_pos, tmp_neg, tmp_res_even, tmp_res_odd, final_sel;
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_2\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_2\n"
-                 "v_cvt_pk_fp8_f32 %[v_dst_hi], %[v_tmp_1], %[v_tmp_0]\n"
+    uint32_t dict_sel = a & 0x07070707;
+    uint32_t sign     = a >> 1;
+    asm volatile("v_and_or_b32 %0, %1, %2, %3"
+                 : "=v"(final_sel)
+                 : "v"(sign), "v"(0x04040404), "v"(0x03020100));
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_1\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_1\n"
-                 "v_cvt_pk_fp8_f32 %[v_dst_lo], %[v_tmp_1], %[v_tmp_0], op_sel:[0, 0, 1]\n"
+    tmp_pos      = __builtin_amdgcn_perm(reg1, reg0, dict_sel);
+    tmp_neg      = __builtin_amdgcn_perm(reg3, reg2, dict_sel);
+    tmp_res_even = __builtin_amdgcn_perm(tmp_neg, tmp_pos, final_sel);
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src]\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src]\n"
-                 "v_cvt_pk_fp8_f32 %[v_dst_lo], %[v_tmp_1], %[v_tmp_0]\n"
-                 : [v_tmp_0] "+v"(tmp_0),
-                   [v_tmp_1] "+v"(tmp_1),
-                   [v_hi_src] "+v"(src_hi),
-                   [v_dst_lo] "+v"(fp8x4_lo),
-                   [v_dst_hi] "+v"(fp8x4_hi),
-                   [v_src] "+v"(src)
-                 :);
+    a >>= 4;
+    dict_sel = a & 0x07070707;
+    sign     = a >> 1;
+    asm volatile("v_and_or_b32 %0, %1, %2, %3"
+                 : "=v"(final_sel)
+                 : "v"(sign), "v"(0x04040404), "v"(0x03020100));
 
-    return bit_cast<fp8x8_t>(((static_cast<uint64_t>(fp8x4_hi) << 32) | fp8x4_lo));
+    tmp_pos           = __builtin_amdgcn_perm(reg1, reg0, dict_sel);
+    tmp_neg           = __builtin_amdgcn_perm(reg3, reg2, dict_sel);
+    tmp_res_odd       = __builtin_amdgcn_perm(tmp_neg, tmp_pos, final_sel);
+    auto tmp_res_low  = __builtin_amdgcn_perm(tmp_res_odd, tmp_res_even, 0x06040200);
+    auto tmp_res_high = __builtin_amdgcn_perm(tmp_res_odd, tmp_res_even, 0x07050301);
+
+    return bit_cast<fp8x8_t>((static_cast<uint64_t>(tmp_res_high) << 32) | tmp_res_low);
 }
+#else
+CK_TILE_DEVICE fp8x4_t i4_to_fp8x4(int q)
+{
+    // The approach below can be used once this compiler issue is resolved:
+    // "constexpr bit cast involving type 'unsigned _BitInt(8)' is not yet supported"
+    // Lookup table for fp8_t values corresponding to int4 values -8 to 7
+    constexpr auto fp8_lookup_table = make_lookup_table<fp8_t, 16>(
+        [](int i) { return impl::cast_to_f8<float, fp8_t, true, false>(i - 8, 0); });
+
+    return fp8x4_t{fp8_lookup_table[(q >> 0) & 0xf],
+                   fp8_lookup_table[(q >> 16) & 0xf],
+                   fp8_lookup_table[(q >> 4) & 0xf],
+                   fp8_lookup_table[(q >> 20) & 0xf]};
+}
+#endif
 
 CK_TILE_DEVICE float amd_assembly_fp8_to_fp32(uint32_t src)
 {
@@ -157,38 +269,83 @@ CK_TILE_DEVICE float amd_assembly_bf8_to_fp32(uint32_t src)
     return res;
 }
 
-CK_TILE_DEVICE bf8x8_t amd_assembly_i4_to_bf8x8(int a)
+#if !CONSTEXPR_LOOKUP_TABLE_FOR_BF8
+/**
+ * @brief This function converts 8 packed 4-bit integers into 8 bf8 values.
+ *
+ * @note `int q` contains 4 bytes, each byte represents 2 int4.
+ * @note This function assumes pk_int4_t has a bias of 8, meaning 0b0000 is converted to bf8(-8)
+ * @note The output ordering differs from input ordering. For example, when input is 0x76543210,
+ *       the output sequence will be bf8(7, 3, 6, 2, 5, 1, 4, 0). Therefore, the input tensor
+ *       must be preprocessed with permute_vectors_i4x4_b on the host side before using this
+ * function.
+ *
+ * @see permute_vectors_i4x4_b
+ */
+CK_TILE_DEVICE bf8x8_t amd_assembly_i4_to_bf8x8(uint32_t a)
 {
-    uint32_t src = static_cast<uint32_t>(a), src_hi;
-    uint32_t bf8x4_lo, bf8x4_hi;
-    float tmp_0, tmp_1;
+#if CK_TILE_USE_OCP_FP8
+    // register values [3, 2, 1, 0]
+    static constexpr uint32_t reg0 = 0Xc5c6c7c8;
+    // register values [7, 6, 5, 4]
+    static constexpr uint32_t reg1 = 0Xbcc0c2c4;
+    // register values [11, 10, 9, 8]
+    static constexpr uint32_t reg2 = 0X42403c00;
+    // register values [15, 14, 13, 12]
+    static constexpr uint32_t reg3 = 0X47464544;
+#else
+    // register values [3, 2, 1, 0]
+    static constexpr uint32_t reg0 = 0Xc9cacbcc;
+    // register values [7, 6, 5, 4]
+    static constexpr uint32_t reg1 = 0Xc0c4c6c8;
+    // register values [11, 10, 9, 8]
+    static constexpr uint32_t reg2 = 0X46444000;
+    // register values [15, 14, 13, 12]
+    static constexpr uint32_t reg3 = 0X4b4a4948;
+#endif
 
-    asm volatile("v_lshrrev_b32 %[v_hi_src], 4, %[v_src]\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_3\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_3\n"
-                 "v_cvt_pk_bf8_f32 %[v_dst_hi], %[v_tmp_1], %[v_tmp_0], op_sel:[0, 0, 1]\n"
+    uint32_t tmp_pos, tmp_neg, tmp_res_even, tmp_res_odd, final_sel;
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_2\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_2\n"
-                 "v_cvt_pk_bf8_f32 %[v_dst_hi], %[v_tmp_1], %[v_tmp_0]\n"
+    uint32_t dict_sel = a & 0x07070707;
+    uint32_t sign     = a >> 1;
+    asm volatile("v_and_or_b32 %0, %1, %2, %3"
+                 : "=v"(final_sel)
+                 : "v"(sign), "v"(0x04040404), "v"(0x03020100));
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_1\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src], src0_sel:BYTE_1\n"
-                 "v_cvt_pk_bf8_f32 %[v_dst_lo], %[v_tmp_1], %[v_tmp_0], op_sel:[0, 0, 1]\n"
+    tmp_pos      = __builtin_amdgcn_perm(reg1, reg0, dict_sel);
+    tmp_neg      = __builtin_amdgcn_perm(reg3, reg2, dict_sel);
+    tmp_res_even = __builtin_amdgcn_perm(tmp_neg, tmp_pos, final_sel);
 
-                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src]\n"
-                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_hi_src]\n"
-                 "v_cvt_pk_bf8_f32 %[v_dst_lo], %[v_tmp_1], %[v_tmp_0]\n"
-                 : [v_tmp_0] "+v"(tmp_0),
-                   [v_tmp_1] "+v"(tmp_1),
-                   [v_hi_src] "+v"(src_hi),
-                   [v_dst_lo] "+v"(bf8x4_lo),
-                   [v_dst_hi] "+v"(bf8x4_hi),
-                   [v_src] "+v"(src)
-                 :);
+    a >>= 4;
+    dict_sel = a & 0x07070707;
+    sign     = a >> 1;
+    asm volatile("v_and_or_b32 %0, %1, %2, %3"
+                 : "=v"(final_sel)
+                 : "v"(sign), "v"(0x04040404), "v"(0x03020100));
 
-    return bit_cast<bf8x8_t>(((static_cast<uint64_t>(bf8x4_hi) << 32) | bf8x4_lo));
+    tmp_pos           = __builtin_amdgcn_perm(reg1, reg0, dict_sel);
+    tmp_neg           = __builtin_amdgcn_perm(reg3, reg2, dict_sel);
+    tmp_res_odd       = __builtin_amdgcn_perm(tmp_neg, tmp_pos, final_sel);
+    auto tmp_res_low  = __builtin_amdgcn_perm(tmp_res_odd, tmp_res_even, 0x06040200);
+    auto tmp_res_high = __builtin_amdgcn_perm(tmp_res_odd, tmp_res_even, 0x07050301);
+
+    return bit_cast<bf8x8_t>((static_cast<uint64_t>(tmp_res_high) << 32) | tmp_res_low);
 }
+#else
+CK_TILE_DEVICE bf8x4_t i4_to_bf8x4(int q)
+{
+    // The approach below can be used once this compiler issue is resolved:
+    // "constexpr bit cast involving type 'unsigned _BitInt(8)' is not yet supported"
+    // Lookup table for bf8_t values corresponding to int4 values -8 to 7
+    constexpr auto bf8_lookup_table = make_lookup_table<bf8_t, 16>(
+        [](int i) { return impl::cast_to_f8<float, bf8_t, true, false>(i - 8, 0); });
+
+    return bf8x4_t{bf8_lookup_table[(q >> 0) & 0xf],
+                   bf8_lookup_table[(q >> 16) & 0xf],
+                   bf8_lookup_table[(q >> 4) & 0xf],
+                   bf8_lookup_table[(q >> 20) & 0xf]};
+}
+#endif
 
 struct PassThroughPack8
 {
@@ -204,17 +361,27 @@ struct PassThroughPack8
     CK_TILE_HOST_DEVICE constexpr void operator()(bf16x8_t& y, const pk_int4x4_t& x) const
     {
         y.lo = i4_to_bhalf4(bit_cast<int>(x));
-        y.hi = i4_to_bhalf4(bit_cast<int>(x) >> 16);
+        y.hi = i4_to_bhalf4(bit_cast<int>(x) >> 8);
     }
 
     CK_TILE_HOST_DEVICE constexpr void operator()(fp8x8_t& y, const pk_int4x4_t& x) const
     {
-        y = amd_assembly_i4_to_fp8x8(bit_cast<int>(x));
+#if !CONSTEXPR_LOOKUP_TABLE_FOR_FP8
+        y = amd_assembly_i4_to_fp8x8(bit_cast<uint32_t>(x));
+#else
+        y.lo = i4_to_fp8x4(bit_cast<int>(x));
+        y.hi = i4_to_fp8x4(bit_cast<int>(x) >> 8);
+#endif
     }
 
     CK_TILE_HOST_DEVICE constexpr void operator()(bf8x8_t& y, const pk_int4x4_t& x) const
     {
-        y = amd_assembly_i4_to_bf8x8(bit_cast<int>(x));
+#if !CONSTEXPR_LOOKUP_TABLE_FOR_BF8
+        y = amd_assembly_i4_to_bf8x8(bit_cast<uint32_t>(x));
+#else
+        y.lo = i4_to_bf8x4(bit_cast<int>(x));
+        y.hi = i4_to_bf8x4(bit_cast<int>(x) >> 8);
+#endif
     }
     constexpr const static bool is_pack8_invocable = true;
 };
@@ -296,6 +463,23 @@ struct PassThrough
     }
 };
 
+struct AddScale
+{
+    template <typename E, typename... As>
+    CK_TILE_HOST_DEVICE constexpr void operator()(E& a, const As&... as) const
+    {
+        // Start with the base value c
+        float result = ck_tile::type_convert<float>(0.0f);
+
+        // Add by each D parameter using fold expression
+        ((result += ck_tile::type_convert<float>(as)), ...);
+
+        a = ck_tile::type_convert<E>(scale * result);
+    }
+
+    float scale = 1.0;
+};
+
 struct MultiDMultiply
 {
     template <typename E, typename C, typename... Ds>
@@ -326,7 +510,6 @@ struct MultiDAdd
     }
 };
 
-#if 0
 struct UnaryConvert
 {
     template <typename Y, typename X>
@@ -336,6 +519,7 @@ struct UnaryConvert
     }
 };
 
+#if 0
 struct ConvertBF16RTN
 {
     // convert to bf16 using round to nearest (rtn)
@@ -472,14 +656,14 @@ struct UnaryDivide
 
 struct UnarySquare
 {
-    template <typename T>
-    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
     {
-        static_assert(std::is_same_v<T, float> || std::is_same_v<T, ck_tile::fp16_t> ||
-                          std::is_same_v<T, double> || std::is_same_v<T, int32_t> ||
-                          std::is_same_v<T, int8_t>
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t> ||
+                          std::is_same_v<X, double> || std::is_same_v<X, int32_t> ||
+                          std::is_same_v<X, int8_t>
 #ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-                          || std::is_same_v<T, int4_t>
+                          || std::is_same_v<X, int4_t>
 #endif
                       ,
                       "Data type is not supported by this operation!");
diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp
index 6cc0fa8540..ec5a8ef445 100644
--- a/include/ck_tile/ops/epilogue.hpp
+++ b/include/ck_tile/ops/epilogue.hpp
@@ -8,5 +8,7 @@
 #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
 #include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
index 1d0a4c42f4..e0a39a5aea 100644
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -6,11 +6,14 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+#include <optional>
+#include <type_traits>
 
 namespace ck_tile {
-
-template <typename ADataType_,
-          typename BDataType_,
+template <typename AsDataType_,
+          typename BsDataType_,
           typename DsDataType_,
           typename AccDataType_,
           typename ODataType_,
@@ -28,11 +31,12 @@ template <typename ADataType_,
           memory_operation_enum MemoryOperation_,
           index_t kNumWaveGroups_ = 1,
           bool FixedVectorSize_   = false,
-          index_t VectorSizeC_    = 1>
+          index_t VectorSizeC_    = 1,
+          bool TiledMMAPermuteN_  = false>
 struct CShuffleEpilogueProblem
 {
-    using ADataType                                        = remove_cvref_t<ADataType_>;
-    using BDataType                                        = remove_cvref_t<BDataType_>;
+    using AsDataType                                       = remove_cvref_t<AsDataType_>;
+    using BsDataType                                       = remove_cvref_t<BsDataType_>;
     using AccDataType                                      = remove_cvref_t<AccDataType_>;
     using ODataType                                        = remove_cvref_t<ODataType_>;
     using DsDataType                                       = remove_cvref_t<DsDataType_>;
@@ -51,6 +55,7 @@ struct CShuffleEpilogueProblem
     static constexpr memory_operation_enum MemoryOperation = MemoryOperation_;
     static constexpr bool FixedVectorSize                  = FixedVectorSize_;
     static constexpr index_t VectorSizeC                   = VectorSizeC_;
+    static constexpr bool TiledMMAPermuteN                 = TiledMMAPermuteN_;
     static constexpr index_t kNumWaveGroups                = kNumWaveGroups_;
     static constexpr index_t NumDTensor                    = DsDataType::size();
 
@@ -62,12 +67,27 @@ template <typename Problem_, typename Policy_ = void>
 struct CShuffleEpilogue
 {
     using Problem     = remove_cvref_t<Problem_>;
-    using ADataType   = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType   = remove_cvref_t<typename Problem::BDataType>;
+    using AsDataType  = remove_cvref_t<typename Problem::AsDataType>;
+    using BsDataType  = remove_cvref_t<typename Problem::BsDataType>;
     using AccDataType = remove_cvref_t<typename Problem::AccDataType>;
     using ODataType   = remove_cvref_t<typename Problem::ODataType>;
     using DsDataType  = remove_cvref_t<typename Problem::DsDataType>;
     using DsLayout    = remove_cvref_t<typename Problem::DsLayout>;
+
+    static constexpr bool ADataTypeIsTuple = is_detected<is_tuple, AsDataType>::value;
+    static constexpr bool BDataTypeIsTuple = is_detected<is_tuple, BsDataType>::value;
+
+    using AsDataTypeTuple = std::conditional_t<ADataTypeIsTuple,
+                                               remove_cvref_t<AsDataType>,
+                                               remove_cvref_t<tuple<AsDataType>>>;
+
+    using BsDataTypeTuple = std::conditional_t<BDataTypeIsTuple,
+                                               remove_cvref_t<BsDataType>,
+                                               remove_cvref_t<tuple<BsDataType>>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
+
     using ATypeToUse =
         std::conditional_t<std::is_same_v<ADataType, pk_int4_t>, BDataType, ADataType>;
     // Used for weight-only quantization kernel, B would be dequantized to the same data type as A
@@ -86,10 +106,13 @@ struct CShuffleEpilogue
     static constexpr index_t KPerXdl                       = Problem::KPerXdl;
     static constexpr index_t isCTransposed                 = Problem::isCTransposed;
     static constexpr bool FixedVectorSize                  = Problem::FixedVectorSize;
+    static constexpr bool TiledMMAPermuteN                 = Problem::TiledMMAPermuteN;
     static constexpr index_t VectorSizeC                   = Problem::VectorSizeC;
     static constexpr index_t MPerIteration                 = MPerXdl * MWave;
     static constexpr index_t NPerIteration                 = NPerXdl * NWave;
     static constexpr index_t NumDTensor                    = Problem::NumDTensor;
+    static constexpr index_t MRepeat                       = kMPerBlock / (MPerXdl * MWave);
+    static constexpr index_t NRepeat                       = kNPerBlock / (NPerXdl * NWave);
 
     static_assert(NumDTensor == DsLayout::size(),
                   "The size of DsDataType and DsLayout should be the same");
@@ -210,8 +233,12 @@ struct CShuffleEpilogue
                                   KPerXdl,
                                   isCTransposed>;
 
-    using CWarpDstr   = typename WG::CWarpDstr;
-    using CWarpTensor = typename WG::CWarpTensor;
+    using CWarpDstr         = typename WG::CWarpDstr;
+    using CWarpTensor       = typename WG::CWarpTensor;
+    using CWarpDstrEncoding = typename WG::CWarpDstrEncoding;
+    using SFC               = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
+                                                  sequence<0, 1>,
+                                                  sequence<MPerIterationShuffle, NPerIterationShuffle>>;
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLdsBlockDescriptor()
@@ -257,11 +284,296 @@ struct CShuffleEpilogue
         return MPerIterationShuffle * NPerIterationShuffle * sizeof(ODataType);
     }
 
-    template <typename ODramWindow, typename OAccTile, typename DsDramWindows>
+    template <index_t iAccess, typename LdsTile, typename ScaleM, typename ScaleN>
+    CK_TILE_DEVICE void
+    scale_tile(LdsTile& lds_tile, ScaleM& scale_m_window, ScaleN& scale_n_window)
+    {
+        // Check if scales are EmptyScale first (no scaling needed)
+        if constexpr(std::is_same_v<ScaleM, EmptyScale> && std::is_same_v<ScaleN, EmptyScale>)
+        {
+            // No scaling needed - this is a no-op
+        }
+        // Check if scales are scalar AccDataType
+        else if constexpr(std::is_same_v<ScaleM, AccDataType> &&
+                          std::is_same_v<ScaleN, AccDataType>)
+        {
+            // Handle scalar scales
+            const AccDataType scale_m = scale_m_window;
+            const AccDataType scale_n = scale_n_window;
+            tile_elementwise_inout([&](auto& element) { element = element * scale_m * scale_n; },
+                                   lds_tile);
+        }
+        // Otherwise, assume they are tile windows that can be loaded
+        else
+        {
+            // Load tiles
+            const auto scale_m_tile = load_tile(scale_m_window);
+            const auto scale_n_tile = load_tile(scale_n_window);
+
+            // Compute element-wise product in-place i.e. lds_tile = lds_tile * scale_m * scale_n
+            tile_elementwise_inout(
+                element_wise::MultiDMultiply{}, lds_tile, lds_tile, scale_m_tile, scale_n_tile);
+
+            // Move scale windows
+            constexpr index_t num_access = SFC::get_num_of_access();
+            if constexpr(iAccess != num_access - 1)
+            {
+                constexpr auto step = SFC::get_forward_step(number<iAccess>{});
+
+                move_tile_window(scale_m_window, {step.at(number<0>{}), step.at(number<1>{})});
+                move_tile_window(scale_n_window, {step.at(number<0>{}), step.at(number<1>{})});
+            }
+        }
+    }
+
+    template <index_t iAccess, typename OAccTile, typename LdsTile>
+    CK_TILE_DEVICE void slice_acc_tile(const OAccTile& o_acc_tile, LdsTile& lds_tile)
+    {
+        constexpr auto idx_y_start = SFC::get_index(number<iAccess>{});
+
+        constexpr auto mIter = number<idx_y_start.at(number<0>{}) / (MPerIterationShuffle)>{};
+        constexpr auto nIter = number<idx_y_start.at(number<1>{}) / (NPerIterationShuffle)>{};
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        lds_tile.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
+            merge_sequences(
+                sequence<mIter * NumMXdlPerWavePerShuffle, nIter * NumNXdlPerWavePerShuffle>{},
+                c_warp_y_index_zeros),
+            merge_sequences(sequence<NumMXdlPerWavePerShuffle, NumNXdlPerWavePerShuffle>{},
+                            c_warp_y_lengths));
+    }
+
+    template <typename LdsTile, typename InLdsWindow>
+    CK_TILE_DEVICE void cast_lds_tile(LdsTile& lds_tile, InLdsWindow& in_lds_window)
+    {
+        const auto c_warptile_in_tensor_casted = cast_tile<ODataType>(lds_tile);
+
+        store_tile(in_lds_window, c_warptile_in_tensor_casted);
+    }
+
+    template <typename DramWindows, typename COutTensor>
+    CK_TILE_DEVICE void apply_d_tensors(DramWindows& d_dram_windows, COutTensor& c_out_tensor)
+    {
+        const auto ds_tensor = generate_tuple(
+            [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
+
+        const auto c_ds_tiles = concat_tuple_of_reference(
+            tie(c_out_tensor, c_out_tensor),
+            generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
+                         number<NumDTensor>{}));
+
+        tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
+    }
+
+    template <typename OutDramWindow, typename COutTensor>
+    CK_TILE_DEVICE void store_to_dram(OutDramWindow& out_dram_window,
+                                      const COutTensor& c_out_tensor)
+    {
+        if constexpr(MemoryOperation == memory_operation_enum::set)
+        {
+            store_tile(out_dram_window, c_out_tensor);
+        }
+        else
+        {
+            update_tile(out_dram_window, c_out_tensor);
+        }
+    }
+
+    /**
+     * @brief Move both the output and D tensors windows for the next access.
+     */
+    template <index_t iAccess, typename OutDramWindow, typename DDramWindows>
+    CK_TILE_DEVICE void move_windows(OutDramWindow& out_dram_window, DDramWindows& d_dram_windows)
+    {
+        constexpr index_t num_access = SFC::get_num_of_access();
+        if constexpr(iAccess != num_access - 1)
+        {
+            constexpr auto step = SFC::get_forward_step(number<iAccess>{});
+
+            // move the output dram window
+            move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})});
+
+            // move windows for each of the D matrices (inputs for element-wise)
+            static_for<0, NumDTensor, 1>{}([&](auto idx) {
+                move_tile_window(d_dram_windows[idx], {step.at(number<0>{}), step.at(number<1>{})});
+            });
+        }
+    }
+
+    // TODO: Check if there would be nicer ways to overload rather than with EmptyScale or nullptr_t
+    struct EmptyScale
+    {
+    };
+
+    template <typename, typename = void>
+    struct ScaleDataType
+    {
+        using DataType = float;
+    };
+
+    template <typename T>
+    struct ScaleDataType<T, std::void_t<typename T::DataType>>
+    {
+        using DataType = typename T::DataType;
+    };
+
+    template <typename ODramWindow,
+              typename OAccTile,
+              typename DsDramWindows,
+              typename ScaleM                         = EmptyScale,
+              typename ScaleN                         = EmptyScale,
+              int EnablePermuateN_                    = TiledMMAPermuteN,
+              std::enable_if_t<EnablePermuateN_, int> = 0>
     CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
                                    const OAccTile& o_acc_tile,
                                    const DsDramWindows& ds_dram_windows,
-                                   void* p_smem)
+                                   void* /*p_smem*/,
+                                   const ScaleM& scale_m = {},
+                                   const ScaleN& scale_n = {})
+    {
+        constexpr int kM0 = MWave;
+        constexpr int kM2 = 4;
+        constexpr int kM1 = MPerXdl / kM2;
+
+        constexpr int kN0 = NWave;
+        constexpr int kN1 = NPerXdl;
+        constexpr int kN2 = NRepeat;
+
+        using IntrThreadShuffleEncode =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<kM0, kM1, kM2>, sequence<kN0, kN1, kN2>>,
+                                       tuple<sequence<1, 2>, sequence<1, 2>>,
+                                       tuple<sequence<0, 0>, sequence<1, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<2, 2>>;
+        constexpr auto dram_tile_distribution =
+            make_static_tile_distribution(IntrThreadShuffleEncode{});
+
+        auto d_dram_windows = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(ds_dram_windows[idx], dram_tile_distribution);
+            },
+            number<NumDTensor>{});
+
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        auto shuffle_acc  = make_static_distributed_tensor<AccDataType>(dram_tile_distribution);
+        auto c_out_tensor = make_static_distributed_tensor<ODataType>(dram_tile_distribution);
+
+        // Optional scales (must share the same distribution to match per-thread indexing)
+        constexpr bool has_scales =
+            !std::is_same<ScaleM, EmptyScale>::value && !std::is_same<ScaleN, EmptyScale>::value;
+        constexpr bool has_scalar_scales =
+            std::is_same_v<ScaleM, AccDataType> && std::is_same_v<ScaleN, AccDataType>;
+
+        // Tiles to hold row/col scales when present
+        using SMType = typename ScaleDataType<ScaleM>::DataType;
+        using SNType = typename ScaleDataType<ScaleN>::DataType;
+
+        auto sm_tile = make_static_distributed_tensor<SMType>(dram_tile_distribution);
+        auto sn_tile = make_static_distributed_tensor<SNType>(dram_tile_distribution);
+
+        // Build windows only if non-scalar scales are provided
+        auto scale_m_window = [&]() {
+            if constexpr(has_scales && !has_scalar_scales)
+            {
+                return make_tile_window(scale_m, dram_tile_distribution);
+            }
+            else
+            {
+                return EmptyScale{};
+            }
+        }();
+        auto scale_n_window = [&]() {
+            if constexpr(has_scales && !has_scalar_scales)
+            {
+                return make_tile_window(scale_n, dram_tile_distribution);
+            }
+            else
+            {
+                return EmptyScale{};
+            }
+        }();
+
+        static_for<0, MRepeat, 1>{}([&](auto mIter) {
+            // Slice accumulators for this M repeat into the permuted layout
+            shuffle_acc.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
+                merge_sequences(sequence<mIter, 0>{}, c_warp_y_index_zeros),
+                merge_sequences(sequence<1, NRepeat>{}, c_warp_y_lengths));
+
+            // If non-scalar scales provided, load them with identical distribution
+            if constexpr(has_scales && !has_scalar_scales)
+            {
+                sm_tile = load_tile(scale_m_window); // row scales in permuted layout
+                sn_tile = load_tile(scale_n_window); // col scales in permuted layout
+            }
+
+            // Pack 4 “rows per lane” as you already do
+            static_for<0, NRepeat, 1>{}([&](auto n_idx) {
+                // source indices in shuffle_acc: (n_idx * product(Y) + row)
+                const index_t base = n_idx * c_warp_y_lengths.product();
+
+                // local lambda to fuse scale (if present) and convert
+                auto emit = [&](index_t out_idx, index_t src_row) {
+                    AccDataType v = shuffle_acc.get_thread_buffer()[base + src_row];
+
+                    if constexpr(has_scalar_scales)
+                    {
+                        v = static_cast<AccDataType>(v * scale_m * scale_n);
+                    }
+                    else if constexpr(has_scales && !has_scalar_scales)
+                    {
+                        // same linear index mapping on the permuted distribution
+                        const auto s_m = static_cast<float>(sm_tile.get_thread_buffer()[out_idx]);
+                        const auto s_n = static_cast<float>(sn_tile.get_thread_buffer()[out_idx]);
+                        v              = static_cast<AccDataType>(v * s_m * s_n);
+                    }
+
+                    c_out_tensor.get_thread_buffer()[out_idx] = type_convert<ODataType>(v);
+                };
+
+                // Your current packing pattern (rows 0..3, spaced by NRepeat)
+                emit(n_idx + 0 * NRepeat, 0);
+                emit(n_idx + 1 * NRepeat, 1);
+                emit(n_idx + 2 * NRepeat, 2);
+                emit(n_idx + 3 * NRepeat, 3);
+            });
+
+            // store/update
+            if constexpr(MemoryOperation == memory_operation_enum::set)
+            {
+                store_tile(out_dram_window, c_out_tensor);
+            }
+            else
+            {
+                update_tile(out_dram_window, c_out_tensor);
+            }
+
+            // advance output (and any D-tensors) by one MPerXdl*MWave chunk
+            move_tile_window(out_dram_window, {number<MPerXdl * MWave>{}, number<0>{}});
+            static_for<0, NumDTensor, 1>{}([&](auto idx) {
+                move_tile_window(d_dram_windows[idx], {number<MPerXdl * MWave>{}, number<0>{}});
+            });
+        });
+    }
+
+    template <typename ODramWindow,
+              typename OAccTile,
+              typename DsDramWindows,
+              typename ScaleM                          = EmptyScale,
+              typename ScaleN                          = EmptyScale,
+              int EnablePermuateN_                     = TiledMMAPermuteN,
+              std::enable_if_t<!EnablePermuateN_, int> = 0>
+    CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
+                                   const OAccTile& o_acc_tile,
+                                   const DsDramWindows& ds_dram_windows,
+                                   void* p_smem,
+                                   const ScaleM& scale_m = {},
+                                   const ScaleN& scale_n = {})
     {
         constexpr auto LdsTileDistr = make_static_tile_distribution(MakeLdsDistributionEncode());
 
@@ -282,22 +594,20 @@ struct CShuffleEpilogue
             make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
             {0, 0});
 
-        using SFC                    = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
-                                                           sequence<0, 1>,
-                                                           sequence<MPerIterationShuffle, NPerIterationShuffle>>;
         constexpr index_t num_access = SFC::get_num_of_access();
 
         static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
                       "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
 
         using TileEncodingPattern =
-            TileDistributionEncodingPattern2D<kBlockSize,
-                                              MPerIterationShuffle,
-                                              NPerIterationShuffle,
-                                              GetVectorSizeC(),
-                                              tile_distribution_pattern::thread_raked,
-                                              Problem::kNumWaveGroups>;
-        constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();
+            tile_distribution_encoding_pattern_2d<kBlockSize,
+                                                  MPerIterationShuffle,
+                                                  NPerIterationShuffle,
+                                                  GetVectorSizeC(),
+                                                  tile_distribution_pattern::thread_raked,
+                                                  Problem::kNumWaveGroups>;
+        constexpr auto dram_tile_distribution =
+            TileEncodingPattern::make_2d_static_tile_distribution();
 
         auto d_dram_windows = generate_tuple(
             [&](auto idx) {
@@ -305,60 +615,56 @@ struct CShuffleEpilogue
             },
             number<NumDTensor>{});
 
-        constexpr auto c_warp_y_lengths =
-            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
-        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+        constexpr bool has_scales =
+            !std::is_same_v<ScaleM, EmptyScale> && !std::is_same_v<ScaleN, EmptyScale>;
+        constexpr bool has_scalar_scales =
+            std::is_same_v<ScaleM, AccDataType> && std::is_same_v<ScaleN, AccDataType>;
+        auto scale_m_window = [&]() {
+            if constexpr(has_scalar_scales)
+            {
+                return scale_m;
+            }
+            else if constexpr(has_scales)
+            {
+                return make_tile_window(scale_m, lds_tile.get_tile_distribution());
+            }
+            else
+            {
+                return EmptyScale{};
+            }
+        }();
+        auto scale_n_window = [&]() {
+            if constexpr(has_scalar_scales)
+            {
+                return scale_n;
+            }
+            else if constexpr(has_scales)
+            {
+                return make_tile_window(scale_n, lds_tile.get_tile_distribution());
+            }
+            else
+            {
+                return EmptyScale{};
+            }
+        }();
 
         static_for<0, num_access, 1>{}([&](auto iAccess) {
             block_sync_lds();
-            constexpr auto idx_y_start = SFC::get_index(iAccess);
+            slice_acc_tile<iAccess>(o_acc_tile, lds_tile);
 
-            constexpr auto mIter = number<idx_y_start.at(number<0>{}) / (MPerIterationShuffle)>{};
-            constexpr auto nIter = number<idx_y_start.at(number<1>{}) / (NPerIterationShuffle)>{};
+            if constexpr(has_scales)
+            {
+                scale_tile<iAccess>(lds_tile, scale_m_window, scale_n_window);
+            }
 
-            lds_tile.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
-                merge_sequences(
-                    sequence<mIter * NumMXdlPerWavePerShuffle, nIter * NumNXdlPerWavePerShuffle>{},
-                    c_warp_y_index_zeros),
-                merge_sequences(sequence<NumMXdlPerWavePerShuffle, NumNXdlPerWavePerShuffle>{},
-                                c_warp_y_lengths));
-
-            const auto c_warptile_in_tensor_casted = cast_tile<ODataType>(lds_tile);
-
-            store_tile(in_lds_window, c_warptile_in_tensor_casted);
+            cast_lds_tile(lds_tile, in_lds_window);
             block_sync_lds();
 
             auto c_out_tensor = load_tile(make_tile_window(out_lds_window, dram_tile_distribution));
 
-            const auto ds_tensor = generate_tuple(
-                [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
-
-            const auto c_ds_tiles = concat_tuple_of_reference(
-                tie(c_out_tensor, c_out_tensor),
-                generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
-                             number<NumDTensor>{}));
-
-            tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
-
-            if constexpr(MemoryOperation == memory_operation_enum::set)
-            {
-                store_tile(out_dram_window, c_out_tensor);
-            }
-            else
-            {
-                update_tile(out_dram_window, c_out_tensor);
-            }
-            if constexpr(iAccess != num_access - 1)
-            {
-                constexpr auto step = SFC::get_forward_step(iAccess);
-
-                move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})});
-
-                static_for<0, NumDTensor, 1>{}([&](auto idx) {
-                    move_tile_window(d_dram_windows[idx],
-                                     {step.at(number<0>{}), step.at(number<1>{})});
-                });
-            }
+            apply_d_tensors(d_dram_windows, c_out_tensor);
+            store_to_dram(out_dram_window, c_out_tensor);
+            move_windows<iAccess>(out_dram_window, d_dram_windows);
         });
     }
 };
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index 2e907c2fa8..2843966cd7 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -28,8 +28,8 @@ struct Default2DEpilogueProblem
     static constexpr index_t NumDTensor                    = 0;
 };
 
-template <typename ADataType_,
-          typename BDataType_,
+template <typename AsDataType_,
+          typename BsDataType_,
           typename DsDataType_,
           typename AccDataType_,
           typename ODataType_,
@@ -53,8 +53,8 @@ struct DefaultGemm2DEpilogueProblem : public Default2DEpilogueProblem<AccDataTyp
                                                                       UseRawStore_,
                                                                       MemoryOperation_>
 {
-    using ADataType                        = remove_cvref_t<ADataType_>;
-    using BDataType                        = remove_cvref_t<BDataType_>;
+    using AsDataType                       = remove_cvref_t<AsDataType_>;
+    using BsDataType                       = remove_cvref_t<BsDataType_>;
     using CLayout                          = remove_cvref_t<CLayout_>;
     using DsDataType                       = remove_cvref_t<DsDataType_>;
     using CDElementwise                    = remove_cvref_t<CDElementwise_>;
@@ -91,7 +91,7 @@ struct Default2DEpilogue
     CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
                                    const OAccTile& o_acc_tile,
                                    const DsDramWindows& ds_dram_windows,
-                                   void* = nullptr)
+                                   void* = nullptr) const
     {
         const auto storeOrUpdateTile = [&](const auto& o_tile) {
             // TODO: this is ugly
@@ -157,14 +157,28 @@ struct Default2DEpilogue
 template <typename Problem_, typename Policy_ = void>
 struct DefaultGemm2DEpilogue : public Default2DEpilogue<Problem_, Policy_>
 {
-    using Problem     = remove_cvref_t<Problem_>;
-    using ADataType   = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType   = remove_cvref_t<typename Problem::BDataType>;
-    using AccDataType = remove_cvref_t<typename Problem::AccDataType>;
-    using ODataType   = remove_cvref_t<typename Problem::ODataType>;
+    using Problem                          = remove_cvref_t<Problem_>;
+    using AsDataType                       = remove_cvref_t<typename Problem::AsDataType>;
+    using BsDataType                       = remove_cvref_t<typename Problem::BsDataType>;
+    using AccDataType                      = remove_cvref_t<typename Problem::AccDataType>;
+    using ODataType                        = remove_cvref_t<typename Problem::ODataType>;
+    static constexpr bool ADataTypeIsTuple = is_detected<is_tuple, AsDataType>::value;
+    static constexpr bool BDataTypeIsTuple = is_detected<is_tuple, BsDataType>::value;
+
+    using AsDataTypeTuple = std::conditional_t<ADataTypeIsTuple,
+                                               remove_cvref_t<AsDataType>,
+                                               remove_cvref_t<tuple<AsDataType>>>;
+
+    using BsDataTypeTuple = std::conditional_t<BDataTypeIsTuple,
+                                               remove_cvref_t<BsDataType>,
+                                               remove_cvref_t<tuple<BsDataType>>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
     // Used for weight-only quantization kernel, B would be dequantized to the same data type as A
     using BTypeToUse =
         std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+
     using DsDataType                       = remove_cvref_t<typename Problem::DsDataType>;
     using DsLayout                         = remove_cvref_t<typename Problem::DsLayout>;
     using CDElementwise                    = remove_cvref_t<typename Problem::CDElementwise>;
diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp
index 1714789e63..41463e6a2d 100644
--- a/include/ck_tile/ops/flatmm.hpp
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -14,5 +14,7 @@
 #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
 #include "ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
index 20ca976590..ab0b310510 100644
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -127,7 +127,10 @@ struct FlatmmKernel
         return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize);
+    }
 
     CK_TILE_HOST static constexpr KernelArgs
     MakeKernelArgs(const FlatmmHostArgs<NumDTensor>& hostArgs)
@@ -595,8 +598,8 @@ struct FlatmmKernel
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
     {
         const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
-        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
         const SplitKBatchOffset splitk_batch_offset(kargs);
         // options
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
index 1a28366e24..0cae1a467d 100644
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -185,11 +185,11 @@ struct FlatmmPipelineAGmemBGmemCRegV1 : public BaseFlatmmPipelineAGmemBGmemCRegV
     }
 
     template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const AElementFunction& a_element_func,
-                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem) const
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
     {
         static_assert(
             std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
index 3ca79fc46e..5fd1fb8d39 100644
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -237,8 +237,12 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
     {
-        using TileShape         = typename Problem::BlockGemmShape;
+        using TileShape = typename Problem::BlockGemmShape;
+#if defined(__gfx11__)
+        constexpr index_t scale = 4;
+#else
         constexpr index_t scale = get_warp_size() == 32 ? 2 : 1;
+#endif
         if constexpr(TileShape::WarpTile::at(I1) == 32)
         {
             return TileShape::WarpTile::at(I2) * scale / 2;
@@ -342,7 +346,7 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
+    CK_TILE_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
     {
         using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
 
@@ -350,8 +354,13 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
         constexpr index_t WaveSize  = get_warp_size();
         constexpr index_t WaveNum   = BlockSize / WaveSize;
 
-        constexpr index_t KBPerLoad   = GetKBPerLoad<Problem>();
-        constexpr index_t KThdPerWave = WaveSize; // threads cnt in K dim
+        constexpr index_t KBPerLoad = GetKBPerLoad<Problem>();
+#if defined(__gfx11__)
+        constexpr index_t KRepeatInWave = 2;
+#else
+        constexpr index_t KRepeatInWave = 1;
+#endif
+        constexpr index_t KThdPerWave = WaveSize / KRepeatInWave; // threads cnt in K dim
         constexpr index_t KWavePerBlk = 1;
         constexpr index_t KRepeat     = 1;
         static_assert(TileShape::flatKPerWarp == KThdPerWave * KBPerLoad, "wrong");
@@ -362,16 +371,15 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
         constexpr index_t NRepeat     = 1;
 
         constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
-
         return make_static_tile_distribution(
             tile_distribution_encoding<
-                sequence<WaveRepeat>,                                          // ?
+                sequence<WaveRepeat, KRepeatInWave>,                           // ?
                 tuple<sequence<NRepeat, NWavePerBlk, NThdPerWave, NBPerLoad>,  // second direction
                       sequence<KRepeat, KWavePerBlk, KThdPerWave, KBPerLoad>>, // first  direction
                 // wave in blk,     // thd in wave
                 // <M, K>           // <M, K>
-                tuple<sequence<0, 1, 2>, sequence<1, 2>>, // which direction
-                tuple<sequence<0, 1, 1>, sequence<2, 2>>, // which index
+                tuple<sequence<0, 1, 2>, sequence<0, 1, 2>>, // which direction
+                tuple<sequence<0, 1, 1>, sequence<1, 2, 2>>, // which index
                 // <repeat, vec_load>
                 sequence<1, 1, 2, 2>,
                 sequence<0, 3, 0, 3>>{});
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index 31de21a726..6b25c089bd 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -60,5 +60,7 @@
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/fmha/block/block_dropout.hpp b/include/ck_tile/ops/fmha/block/block_dropout.hpp
index e036402e16..8abdd54cd9 100644
--- a/include/ck_tile/ops/fmha/block/block_dropout.hpp
+++ b/include/ck_tile/ops/fmha/block/block_dropout.hpp
@@ -1,17 +1,44 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 
 namespace ck_tile {
 
+// BlockDropoutBwd and BlockDropout (fwd) support two warp gemm tile sizes: 32x32 (MFMA only) and
+// 16x16 (MFMA and WMMA). Even if fwd and bwd use different tile sizes, generated random
+// numbers will be the same, they are also the same for MFMA (on CDNA), WMMA (on RDNA), or host
+// (for verification, see ck_tile/host/reference/reference_batched_dropout_randval.hpp).
+//
+// The (row, col) coordinate of the current 32x32 tile in the P matrix determines a subsequence of
+// random numbers (ph_subsequence).
+// The (batch, head, 0..63) coordinate determines an offset in the subsequence (ph_head_offset and
+// ph_offset).
+// This means that subsequences are non-overlapping, reproducible and independent of mask or window.
+//
+// There are 3 modes (all produce the same results):
+//  * For 32x32 MFMA tile each of 64 lanes generates 4 * 32 bits or 16 bytes, so one warp generates
+//  the entire 32x32 tile (64 * 16 = 32 * 32).
+//  * For 16x16 MFMA tile one warp generates 1/4 of the 32x32 tile ((16 * 16) / (64 * 16) = 1/4), 4
+//  warps generate the same 64 * 16 random bytes and each uses its own quarter. If kMPerBlock >
+//  MWarp * WG::kM one warp can generate two 16x16 tiles (MIterPerWarp = 2) so fewer instructions
+//  are needed for generating a 32x32 tile.
+//  * For 16x16 WMMA tile one warp generates 1/2 of the 32x32 tile ((16 * 16) / (32 * 16) = 1/2), 2
+//  warps generate the same 64 * 16 random bytes and each uses its own half. If kMPerBlock > MWarp *
+//  WG::kM one warp can generate two 16x16 tiles.
+
+namespace detail {
+// The number of Philox 4x32 results required to fill 32x32 tile of 8-bit values
+constexpr index_t philox_per_tile = 64;
+} // namespace detail
+
 struct NullBlockDropout
 {
     template <typename BlockGemm, bool IsFwd = true, typename RandValDramBlockWindowTmp>
-    __host__ __device__ static constexpr auto
+    CK_TILE_HOST_DEVICE static constexpr auto
     MakeRandvalDramWindow(RandValDramBlockWindowTmp& randval_dram_block_window_tmp,
                           index_t seqlen_qk_start)
     {
@@ -32,7 +59,9 @@ struct BlockDropout
                                      float rp_undrop_,
                                      uint8_t p_undrop_in_uint8_t_,
                                      bool is_store_randval_)
-        : ph(seed, offset + (i_batch * nheads + i_head) * get_warp_size() + get_lane_id()),
+        : ph_seed(amd_wave_read_first_lane(seed)),
+          ph_head_offset(amd_wave_read_first_lane(offset + (i_batch * nheads + i_head) *
+                                                               detail::philox_per_tile)),
           rp_undrop(rp_undrop_),
           p_undrop_in_uint8_t(p_undrop_in_uint8_t_),
           is_store_randval(is_store_randval_)
@@ -46,11 +75,15 @@ struct BlockDropout
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                    = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp     = config.template at<1>();
-        constexpr index_t NWarp     = config.template at<2>();
-        constexpr index_t kMPerStep = MWarp * WG::kM;
-        constexpr index_t kNPerStep = NWarp * WG::kN;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
+        constexpr index_t kMPerStep    = MIterPerWarp * MWarp * WG::kM;
+        constexpr index_t kNPerStep    = NWarp * WG::kN;
 
         const auto block_origin  = randval_dram_block_window_tmp.get_window_origin();
         auto randval_dram_window = [&]() {
@@ -78,12 +111,17 @@ struct BlockDropout
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                    = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp     = config.template at<1>();
-        constexpr index_t kMPerStep = MWarp * WG::kM;
-        constexpr index_t kNPerStep = WG::kN;
-        constexpr index_t kN1       = 8;
-        constexpr index_t kN0       = kNPerStep / kN1;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
+        constexpr index_t kMPerStep    = MIterPerWarp * MWarp * WG::kM;
+        constexpr index_t kNPerStep    = NWarp * WG::kN;
+        constexpr index_t kN1          = 8;
+        constexpr index_t kN0          = kNPerStep / kN1;
 
         constexpr auto randval_lds_block_desc_0 = make_naive_tensor_descriptor(
             ck_tile::make_tuple(number<kN0>{}, number<kMPerStep>{}, number<kN1>{}),
@@ -107,33 +145,35 @@ struct BlockDropout
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        constexpr index_t MWarp = config.template at<1>();
-        constexpr index_t NWarp = config.template at<2>();
-
-        constexpr index_t MIterPerWarp = 1;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
         constexpr index_t NIterPerWarp = 1;
 
+        // The tile distribution is different from the one in MakeRandValLdsShuffleTileDistribution,
+        // because it can combine 2 (MIterPerWarp) 16x16 subtiles for generating them at once
         constexpr auto randval_block_outer_part_dstr_encoding = tile_distribution_encoding<
             sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<MWarp, MIterPerWarp>, sequence<NIterPerWarp, NWarp>>,
             tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
+            tuple<sequence<0, 1>>,
             sequence<1, 2>,
-            sequence<0, 0>>{};
+            sequence<1, 0>>{};
 
         // Use Bwd WarpGemm to ensure that Fwd's random values ​​are consistent with Bwd.
-        constexpr auto randval_block_inner_part_dstr_encoding = []() {
-            if constexpr(std::is_same_v<typename BlockGemm::ADataType, half_t> &&
-                         std::is_same_v<typename BlockGemm::BDataType, half_t> &&
-                         std::is_same_v<typename BlockGemm::CDataType, float>)
-            {
-                return typename WarpGemmMfmaF16F16F32M32N32K16SwizzleA::CWarpDstrEncoding{};
-            }
-            else
-            {
-                return typename WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA::CWarpDstrEncoding{};
-            }
-        }();
+        constexpr auto randval_block_inner_part_dstr_encoding =
+            typename WarpGemmDispatcher<typename WG::ADataType,
+                                        typename WG::BDataType,
+                                        typename WG::CDataType,
+                                        WG::kM,
+                                        WG::kN,
+                                        WG::kK,
+                                        false,
+                                        IsWG32>::CWarpDstrEncoding{};
 
         constexpr auto randval_block_part_dstr_encode =
             detail::make_embed_tile_distribution_encoding(randval_block_outer_part_dstr_encoding,
@@ -147,11 +187,13 @@ struct BlockDropout
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp = config.template at<1>();
-        constexpr index_t NWarp = config.template at<2>();
-
-        constexpr index_t MIterPerWarp = 1;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
         constexpr index_t NIterPerWarp = 1;
 
         constexpr auto randval_block_outer_part_dstr_encoding = tile_distribution_encoding<
@@ -181,14 +223,16 @@ struct BlockDropout
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                     = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp      = config.template at<1>();
-        constexpr index_t NWarp      = config.template at<2>();
-        using BlockGemmShape         = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
-        constexpr index_t kMPerBlock = BlockGemmShape::kM;
-        constexpr index_t kNPerBlock = BlockGemmShape::kN;
-        constexpr index_t kMPerStep  = MWarp * WG::kM;
-        constexpr index_t kNPerStep  = NWarp * WG::kN;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t kNPerBlock   = BlockGemmShape::kN;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
+        constexpr index_t kMPerStep    = MIterPerWarp * MWarp * WG::kM;
+        constexpr index_t kNPerStep    = NWarp * WG::kN;
 
         // randval tile in LDS
         auto randval_lds = make_tensor_view<address_space_enum::lds>(
@@ -200,42 +244,100 @@ struct BlockDropout
         // register distribute
         auto randval_dist_generated =
             make_static_distributed_tensor<uint8_t>(MakeRandValTileDistribution<BlockGemm>());
-        static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
 
-        auto randval_lds_read_window =
+        const auto randval_lds_read_window =
             make_tile_window(randval_lds_window.get_bottom_tensor_view(),
                              randval_lds_window.get_window_lengths(),
                              randval_lds_window.get_window_origin(),
                              MakeRandValLdsShuffleTileDistribution<BlockGemm>());
 
-        const int start_m0_idx = randval_dram_window.get_window_origin().at(number<0>{});
+        const index_t start_m0_idx = randval_dram_window.get_window_origin().at(number<0>{});
+        const index_t iMWarp       = get_warp_id() / NWarp;
+        const index_t iNWarp       = get_warp_id() % NWarp;
+
+        auto generate_randval = [&](auto i_m0, auto i_n0) {
+            // Generate random numbers
+            uint8_t random_uint8_t[randval_dist_generated.kThreadElementSpaceSize];
+            const index_t wg_m0 = (start_m0_idx / WG::kM) + (i_m0 * MWarp + iMWarp) * MIterPerWarp;
+            const index_t wg_n0 = (start_n0_idx / WG::kN) + (i_n0 * NWarp + iNWarp);
+            if constexpr(IsWG32)
+            {
+                // Generate the whole 32x32 tile at once (each tile consists of random numbers taken
+                // from a separate subsequence of Philox)
+                const unsigned long long ph_subsequence =
+                    bit_cast<unsigned long long>(make_uint2(wg_m0, wg_n0));
+                const index_t ph_offset = get_lane_id();
+                const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
+                ph.get_random_16x8(random_uint8_t, ph_subsequence);
+            }
+            else
+            {
+                // Generate one or two 16x16 subtiles of the 32x32 tile (depending on whether
+                // MIterPerWarp is equal to 1 or 2)
+                const unsigned long long ph_subsequence =
+                    bit_cast<unsigned long long>(make_uint2(wg_m0 / 2, wg_n0 / 2));
+                const index_t subtile_m0 = wg_m0 % 2;
+                if constexpr(get_warp_size() == 32)
+                {
+                    const index_t ph_offset = (get_lane_id() & 15) +
+                                              (((get_lane_id() >> 4) & 1) << 5) +
+                                              ((wg_n0 % 2) << 4);
+                    const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                    if constexpr(MIterPerWarp == 1)
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 8);
+                        ph.get_random_8x8(
+                            random_uint8_t, ph_subsequence, subtile_m0 * 2 + 0, subtile_m0 * 2 + 1);
+                    }
+                    else
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
+                        ph.get_random_16x8(random_uint8_t, ph_subsequence);
+                    }
+                }
+                else
+                {
+                    const index_t subtile_n0 = (get_lane_id() >> 4) & 1;
+                    const index_t ph_offset  = (get_lane_id() & 47) + ((wg_n0 % 2) << 4);
+                    const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                    if constexpr(MIterPerWarp == 1)
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 4);
+                        ph.get_random_4x8(
+                            random_uint8_t, ph_subsequence, subtile_m0 * 2 + subtile_n0);
+                    }
+                    else
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 8);
+                        ph.get_random_8x8(
+                            random_uint8_t, ph_subsequence, 0 * 2 + subtile_n0, 1 * 2 + subtile_n0);
+                    }
+                }
+            }
+
+            constexpr auto randval_dist_generated_spans =
+                decltype(randval_dist_generated)::get_distributed_spans();
+            int i_random_idx = 0;
+            sweep_tile_span(randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx          = ck_tile::make_tuple(idx0, idx1);
+                    randval_dist_generated(i_j_idx) = random_uint8_t[i_random_idx++];
+                });
+            });
+            // Transpose randval using LDS
+            store_tile(randval_lds_window, randval_dist_generated);
+            block_sync_lds();
+            const auto randval = load_tile(randval_lds_read_window);
+            block_sync_lds();
+            return randval;
+        };
+
         if(is_store_randval)
         {
             static_for<0, kMPerBlock / kMPerStep, 1>{}([&](auto i_m0) {
                 static_for<0, kNPerBlock / kNPerStep, 1>{}([&](auto i_n0) {
-                    int block_row_start = (start_m0_idx / WG::kM) + (i_m0 * MWarp) + get_warp_id();
-                    int block_col_start = (start_n0_idx / WG::kN) + i_n0;
-                    uint2 rowcol        = make_uint2(block_row_start, block_col_start);
-
-                    // generate random number
-                    uint8_t random_uint8_t[16];
-                    ph.get_random_16x8(random_uint8_t,
-                                       reinterpret_cast<unsigned long long&>(rowcol));
-
-                    constexpr auto randval_dist_generated_spans =
-                        decltype(randval_dist_generated)::get_distributed_spans();
-                    int i_random_idx = 0;
-                    sweep_tile_span(randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
-                        sweep_tile_span(randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
-                            constexpr auto i_j_idx          = ck_tile::make_tuple(idx0, idx1);
-                            randval_dist_generated(i_j_idx) = random_uint8_t[i_random_idx++];
-                        });
-                    });
-                    // save to LDS
-                    store_tile(randval_lds_window, randval_dist_generated);
-                    block_sync_lds();
-                    // read from LDS to register
-                    auto randval = load_tile(randval_lds_read_window);
+                    const auto randval = generate_randval(i_m0, i_n0);
                     // save to Global
                     const auto randval_store = cast_tile<RandValOutputDataType>(randval);
                     store_tile(randval_dram_window, randval_store);
@@ -244,37 +346,21 @@ struct BlockDropout
                 move_tile_window(randval_dram_window, {kMPerStep, -kNPerBlock});
             });
             move_tile_window(randval_dram_window, {-kMPerBlock, kNPerBlock});
-        };
+        }
         static_for<0, kMPerBlock / kMPerStep, 1>{}([&](auto i_m0) {
             static_for<0, kNPerBlock / kNPerStep, 1>{}([&](auto i_n0) {
-                int block_row_start = (start_m0_idx / WG::kM) + (i_m0 * MWarp) + get_warp_id();
-                int block_col_start = (start_n0_idx / WG::kN) + i_n0;
-                uint2 rowcol        = make_uint2(block_row_start, block_col_start);
-
-                // generate random number
-                uint8_t random_uint8_t[16];
-                ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
-
-                constexpr auto randval_dist_generated_spans =
-                    decltype(randval_dist_generated)::get_distributed_spans();
-                int i_random_idx = 0;
-                sweep_tile_span(randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
-                    sweep_tile_span(randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
-                        constexpr auto i_j_idx          = ck_tile::make_tuple(idx0, idx1);
-                        randval_dist_generated(i_j_idx) = random_uint8_t[i_random_idx++];
-                    });
-                });
-                // save to LDS
-                store_tile(randval_lds_window, randval_dist_generated);
-                block_sync_lds();
-                // read from LDS to register
-                auto randval                 = load_tile(randval_lds_read_window);
+                const auto randval = generate_randval(i_m0, i_n0);
+                // Drop values of P based on the generated probabilities
                 constexpr auto randval_spans = decltype(randval)::get_distributed_spans();
                 sweep_tile_span(randval_spans[number<0>{}], [&](auto idx0) {
                     sweep_tile_span(randval_spans[number<1>{}], [&](auto idx1) {
-                        constexpr auto p_idx0 = tile_distributed_index<i_m0>{};
+                        constexpr auto p_idx0 =
+                            tile_distributed_index<i_m0 * MIterPerWarp +
+                                                   idx0.impl_.template at<0>()>{};
                         constexpr auto p_idx1 =
-                            tile_distributed_index<i_n0, idx1.impl_.at(1), idx1.impl_.at(2)>{};
+                            tile_distributed_index<i_n0,
+                                                   idx1.impl_.template at<1>(),
+                                                   idx1.impl_.template at<2>()>{};
                         constexpr auto p_idx = ck_tile::make_tuple(p_idx0, p_idx1);
                         constexpr auto r_idx = ck_tile::make_tuple(idx0, idx1);
                         p_compute(p_idx)     = randval[r_idx] <= p_undrop_in_uint8_t
@@ -286,12 +372,15 @@ struct BlockDropout
         });
     }
 
-    ck_tile::philox ph;
+    const unsigned long long ph_seed;
+    const unsigned long long ph_head_offset;
     const float rp_undrop;
     const uint8_t p_undrop_in_uint8_t;
     const bool is_store_randval;
 };
 
+// TODO: IsWG32_ is not needed as template parameter and can be removed. IsDropout_ == false can be
+// replaced with NullBlockDropout. This requires changes in xformers and other libs.
 template <bool IsDropout_, bool IsWG32_, bool IsStoreRandval_>
 struct BlockDropoutBwd;
 
@@ -301,8 +390,8 @@ struct BlockDropoutBwd<false, IsWG32_, IsStoreRandval_>
     static constexpr bool IsDropout      = false;
     static constexpr bool IsStoreRandval = IsStoreRandval_;
 
-    template <typename BlockGemm, bool IsFwd = true, typename RandValDramBlockWindowTmp>
-    __host__ __device__ static constexpr auto
+    template <typename BlockGemm, bool IsFwd = false, typename RandValDramBlockWindowTmp>
+    CK_TILE_HOST_DEVICE static constexpr auto
     MakeRandvalDramWindow(RandValDramBlockWindowTmp& randval_dram_block_window_tmp,
                           index_t seqlen_qk_start)
     {
@@ -316,10 +405,7 @@ struct BlockDropoutBwd<false, IsWG32_, IsStoreRandval_>
 template <bool IsWG32_, bool IsStoreRandval_>
 struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
 {
-    static constexpr bool IsDropout = true;
-    // true:  32*32 warp gemm
-    // false: 16*16 warp gemm
-    static constexpr bool IsWG32         = IsWG32_;
+    static constexpr bool IsDropout      = true;
     static constexpr bool IsStoreRandval = IsStoreRandval_;
 
     CK_TILE_HOST_DEVICE BlockDropoutBwd(index_t i_batch,
@@ -329,38 +415,30 @@ struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
                                         unsigned long long offset,
                                         float rp_undrop_,
                                         uint8_t p_undrop_in_uint8_t_)
-        : ph(seed,
-             offset + (i_batch * nheads + i_head) * get_warp_size() +
-                 (IsWG32 ? get_lane_id() : ((get_lane_id() & 47) + ((get_warp_id() & 1) << 4)))),
+        : ph_seed(amd_wave_read_first_lane(seed)),
+          ph_head_offset(amd_wave_read_first_lane(offset + (i_batch * nheads + i_head) *
+                                                               detail::philox_per_tile)),
           rp_undrop(rp_undrop_),
           p_undrop_in_uint8_t(p_undrop_in_uint8_t_)
     {
     }
 
-    template <typename BlockGemm, bool IsFwd = true, typename RandValDramBlockWindowTmp>
+    template <typename BlockGemm, bool IsFwd = false, typename RandValDramBlockWindowTmp>
     CK_TILE_HOST_DEVICE static constexpr auto
     MakeRandvalDramWindow(RandValDramBlockWindowTmp& randval_dram_block_window_tmp,
                           index_t seqlen_qk_start)
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using BlockGemmShape                  = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
-        using WG                              = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t kMPerBlock          = BlockGemmShape::kM;
-        constexpr index_t MWarp               = config.template at<1>();
-        constexpr index_t NWarp               = config.template at<2>();
-        constexpr bool MBwdWG16MultiIterCheck = (!IsFwd) && (!IsWG32) && (kMPerBlock > 16);
-        constexpr index_t kMPerStep           = [&]() {
-            if constexpr(MBwdWG16MultiIterCheck)
-            {
-                return MWarp * WG::kM * 2;
-            }
-            else
-            {
-                return MWarp * WG::kM;
-            }
-        }();
-        constexpr index_t kNPerStep = NWarp * WG::kN;
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
+        constexpr index_t kMPerStep    = MIterPerWarp * MWarp * WG::kM;
+        constexpr index_t kNPerStep    = NWarp * WG::kN;
 
         const auto block_origin  = randval_dram_block_window_tmp.get_window_origin();
         auto randval_dram_window = [&]() {
@@ -384,85 +462,39 @@ struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
     }
 
     template <typename BlockGemm>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeRandValLdsBlockDescriptor()
-    {
-        constexpr auto config =
-            BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                    = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp     = config.template at<1>();
-        constexpr index_t kMPerStep = MWarp * WG::kM;
-        constexpr index_t kNPerStep = WG::kN;
-        constexpr index_t kN1       = 8;
-        constexpr index_t kN0       = kNPerStep / kN1;
-
-        constexpr auto randval_lds_block_desc_0 = make_naive_tensor_descriptor(
-            ck_tile::make_tuple(number<kN0>{}, number<kMPerStep>{}, number<kN1>{}),
-            ck_tile::make_tuple(number<(kMPerStep + 1) * kN1>{}, number<kN1>{}, number<1>{}),
-            number<kN1>{},
-            number<1>{});
-
-        constexpr auto randval_lds_block_desc = transform_tensor_descriptor(
-            randval_lds_block_desc_0,
-            ck_tile::make_tuple(
-                make_pass_through_transform(number<kMPerStep>{}),
-                make_merge_transform(ck_tile::make_tuple(number<kN0>{}, number<kN1>{}))),
-            ck_tile::make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            ck_tile::make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return randval_lds_block_desc;
-    }
-
-    template <typename BlockGemm, bool IsFwd = true>
     CK_TILE_HOST_DEVICE static constexpr auto MakeRandValTileDistribution()
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using BlockGemmShape                  = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
-        constexpr index_t kMPerBlock          = BlockGemmShape::kM;
-        constexpr index_t MWarp               = config.template at<1>();
-        constexpr index_t NWarp               = config.template at<2>();
-        constexpr bool MBwdWG16MultiIterCheck = (!IsFwd) && (!IsWG32) && (kMPerBlock > 16);
-
-        constexpr index_t MIterPerWarp = [&]() {
-            if constexpr(MBwdWG16MultiIterCheck)
-            {
-                return 2;
-            }
-            else
-            {
-                return 1;
-            }
-        }();
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
         constexpr index_t NIterPerWarp = 1;
 
         constexpr auto randval_block_outer_part_dstr_encoding = tile_distribution_encoding<
             sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<MWarp, MIterPerWarp>, sequence<NIterPerWarp, NWarp>>,
             tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
+            tuple<sequence<0, 1>>,
             sequence<1, 2>,
-            sequence<0, 0>>{};
+            sequence<1, 0>>{};
 
-        // Use Bwd WarpGemm to ensure that Fwd's random values ​​are consistent with Bwd.
-        // except headdim256.
-        constexpr auto randval_block_inner_part_dstr_encoding = []() {
-            if constexpr(std::is_same_v<typename BlockGemm::ADataType, half_t> &&
-                         std::is_same_v<typename BlockGemm::BDataType, half_t> &&
-                         std::is_same_v<typename BlockGemm::CDataType, float>)
-            {
-                if constexpr(IsWG32)
-                    return typename WarpGemmMfmaF16F16F32M32N32K16SwizzleA::CWarpDstrEncoding{};
-                else
-                    return typename WarpGemmMfmaF16F16F32M16N16K16::CWarpDstrEncoding{};
-            }
-            else
-            {
-                if constexpr(IsWG32)
-                    return typename WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA::CWarpDstrEncoding{};
-                else
-                    return typename WarpGemmMfmaBf16Bf16F32M16N16K16::CWarpDstrEncoding{};
-            }
-        }();
+        constexpr auto randval_block_inner_part_dstr_encoding =
+            typename WarpGemmDispatcher<typename WG::ADataType,
+                                        typename WG::BDataType,
+                                        typename WG::CDataType,
+                                        WG::kM,
+                                        WG::kN,
+                                        WG::kK,
+                                        false,
+                                        IsWG32>::CWarpDstrEncoding{};
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(randval_block_inner_part_dstr_encoding)>,
+                           typename WG::CWarpDstrEncoding>);
 
         constexpr auto randval_block_part_dstr_encode =
             detail::make_embed_tile_distribution_encoding(randval_block_outer_part_dstr_encoding,
@@ -471,129 +503,6 @@ struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
         return make_static_tile_distribution(randval_block_part_dstr_encode);
     }
 
-    template <typename BlockGemm>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeRandValLdsShuffleTileDistribution()
-    {
-        constexpr auto config =
-            BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp = config.template at<1>();
-        constexpr index_t NWarp = config.template at<2>();
-
-        constexpr index_t MIterPerWarp = 1;
-        constexpr index_t NIterPerWarp = 1;
-
-        constexpr auto randval_block_outer_part_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<1, 2>,
-            sequence<0, 0>>{};
-
-        constexpr auto randval_block_part_dstr_encode =
-            detail::make_embed_tile_distribution_encoding(randval_block_outer_part_dstr_encoding,
-                                                          typename WG::CWarpDstrEncoding{});
-
-        return make_static_tile_distribution(randval_block_part_dstr_encode);
-    }
-
-    template <typename BlockGemm,
-              typename PComputeDataType,
-              typename RandValOutputDataType,
-              typename PComputeWindow,
-              typename RandValDramWindow>
-    CK_TILE_HOST_DEVICE void Run(void* randval_ptr,
-                                 const index_t start_m0_idx,
-                                 const index_t start_n0_idx,
-                                 PComputeWindow& p_compute,
-                                 RandValDramWindow& randval_dram_window) const
-    {
-        constexpr auto config =
-            BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                     = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp      = config.template at<1>();
-        constexpr index_t NWarp      = config.template at<2>();
-        using BlockGemmShape         = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
-        constexpr index_t kMPerBlock = BlockGemmShape::kM;
-        constexpr index_t kNPerBlock = BlockGemmShape::kN;
-        constexpr index_t kMPerStep  = MWarp * WG::kM;
-        constexpr index_t kNPerStep  = NWarp * WG::kN;
-
-        // randval tile in LDS
-        auto randval_lds = make_tensor_view<address_space_enum::lds>(
-            reinterpret_cast<uint8_t*>(randval_ptr), MakeRandValLdsBlockDescriptor<BlockGemm>());
-
-        auto randval_lds_window = make_tile_window(
-            randval_lds, MakeRandValLdsBlockDescriptor<BlockGemm>().get_lengths(), {0, 0});
-
-        // register distribute
-        auto randval_dist_generated =
-            make_static_distributed_tensor<uint8_t>(MakeRandValTileDistribution<BlockGemm>());
-        static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
-
-        auto randval_lds_read_window =
-            make_tile_window(randval_lds_window.get_bottom_tensor_view(),
-                             randval_lds_window.get_window_lengths(),
-                             randval_lds_window.get_window_origin(),
-                             MakeRandValLdsShuffleTileDistribution<BlockGemm>());
-
-        static_for<0, kMPerBlock / kMPerStep, 1>{}([&](auto i_m0) {
-            static_for<0, kNPerBlock / kNPerStep, 1>{}([&](auto i_n0) {
-                int block_row_start = (start_m0_idx / WG::kM) + (i_m0 * MWarp) + get_warp_id();
-                int block_col_start = (start_n0_idx / WG::kN) + i_n0;
-                uint2 rowcol        = make_uint2(block_row_start, block_col_start);
-
-                // generate random number
-                uint8_t random_uint8_t[16];
-                ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
-
-                constexpr auto randval_dist_generated_spans =
-                    decltype(randval_dist_generated)::get_distributed_spans();
-                int i_random_idx = 0;
-                sweep_tile_span(randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
-                    sweep_tile_span(randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
-                        constexpr auto i_j_idx          = ck_tile::make_tuple(idx0, idx1);
-                        randval_dist_generated(i_j_idx) = random_uint8_t[i_random_idx++];
-                    });
-                });
-                // save to LDS
-                store_tile(randval_lds_window, randval_dist_generated);
-                block_sync_lds();
-                // read from LDS to register
-                auto randval                 = load_tile(randval_lds_read_window);
-                constexpr auto randval_spans = decltype(randval)::get_distributed_spans();
-                sweep_tile_span(randval_spans[number<0>{}], [&](auto idx0) {
-                    sweep_tile_span(randval_spans[number<1>{}], [&](auto idx1) {
-                        constexpr auto p_idx0 = tile_distributed_index<i_m0>{};
-                        constexpr auto p_idx1 =
-                            tile_distributed_index<i_n0, idx1.impl_.at(1), idx1.impl_.at(2)>{};
-                        constexpr auto p_idx = ck_tile::make_tuple(p_idx0, p_idx1);
-                        constexpr auto r_idx = ck_tile::make_tuple(idx0, idx1);
-                        p_compute(p_idx)     = randval[r_idx] <= p_undrop_in_uint8_t
-                                                   ? p_compute[p_idx] * rp_undrop
-                                                   : PComputeDataType(0);
-                    });
-                });
-                // save to Global
-                if constexpr(IsStoreRandval)
-                {
-                    const auto randval_store = cast_tile<RandValOutputDataType>(randval);
-                    store_tile(randval_dram_window, randval_store);
-                    move_tile_window(randval_dram_window, {0, kNPerStep});
-                }
-            });
-            if constexpr(IsStoreRandval)
-            {
-                move_tile_window(randval_dram_window, {kMPerStep, -kNPerBlock});
-            }
-        });
-        if constexpr(IsStoreRandval)
-        {
-            move_tile_window(randval_dram_window, {-kMPerBlock, kNPerBlock});
-        }
-    }
-
     template <typename BlockGemm,
               typename RandValOutputDataType,
               typename PComputeWindow,
@@ -605,92 +514,111 @@ struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
     {
         constexpr auto config =
             BlockGemm::Policy::template GetWarpGemmMWarpNWarp<typename BlockGemm::Problem>();
-        using WG                               = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp                = config.template at<1>();
-        constexpr index_t NWarp                = config.template at<2>();
-        using BlockGemmShape                   = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
-        constexpr index_t kMPerBlock           = BlockGemmShape::kM;
-        constexpr index_t kNPerBlock           = BlockGemmShape::kN;
-        constexpr bool MBwdWG16MultiIterCheck  = (!IsWG32) && (kMPerBlock > 16);
-        constexpr bool MBwdWG16SingleIterCheck = (!IsWG32) && (kMPerBlock == 16);
-        constexpr index_t kMPerStep            = [&]() {
-            if constexpr(MBwdWG16MultiIterCheck)
+        using WG                       = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr bool IsWG32          = WG::kM == 32;
+        constexpr index_t MWarp        = config.template at<1>();
+        constexpr index_t NWarp        = config.template at<2>();
+        using BlockGemmShape           = remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+        constexpr index_t kMPerBlock   = BlockGemmShape::kM;
+        constexpr index_t kNPerBlock   = BlockGemmShape::kN;
+        constexpr index_t MIterPerWarp = (!IsWG32 && kMPerBlock > MWarp * WG::kM) ? 2 : 1;
+        constexpr index_t kMPerStep    = MIterPerWarp * MWarp * WG::kM;
+        constexpr index_t kNPerStep    = NWarp * WG::kN;
+
+        // register distribute
+        auto randval_dist_generated =
+            make_static_distributed_tensor<uint8_t>(MakeRandValTileDistribution<BlockGemm>());
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+        const index_t iNWarp = get_warp_id() % NWarp;
+
+        auto generate_randval = [&](auto i_m0, auto i_n0) {
+            // Generate random numbers
+            uint8_t random_uint8_t[randval_dist_generated.kThreadElementSpaceSize];
+            const index_t wg_m0 = (start_m0_idx / WG::kM) + (i_m0 * MWarp + iMWarp) * MIterPerWarp;
+            const index_t wg_n0 = (start_n0_idx / WG::kN) + (i_n0 * NWarp + iNWarp);
+            if constexpr(IsWG32)
             {
-                return MWarp * WG::kM * 2;
+                // Generate the whole 32x32 tile at once (each tile consists of random numbers
+                // taken from a separate subsequence of Philox)
+                const unsigned long long ph_subsequence =
+                    bit_cast<unsigned long long>(make_uint2(wg_m0, wg_n0));
+                const index_t ph_offset = get_lane_id();
+                const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
+                ph.get_random_16x8(random_uint8_t, ph_subsequence);
             }
             else
             {
-                return MWarp * WG::kM;
+                // Generate one or two 16x16 subtiles of the 32x32 tile (depending on whether
+                // MIterPerWarp is equal to 1 or 2)
+                const unsigned long long ph_subsequence =
+                    bit_cast<unsigned long long>(make_uint2(wg_m0 / 2, wg_n0 / 2));
+                const index_t subtile_m0 = wg_m0 % 2;
+                if constexpr(get_warp_size() == 32)
+                {
+                    const index_t ph_offset = (get_lane_id() & 15) +
+                                              (((get_lane_id() >> 4) & 1) << 5) +
+                                              ((wg_n0 % 2) << 4);
+                    const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                    if constexpr(MIterPerWarp == 1)
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 8);
+                        ph.get_random_8x8(
+                            random_uint8_t, ph_subsequence, subtile_m0 * 2 + 0, subtile_m0 * 2 + 1);
+                    }
+                    else
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
+                        ph.get_random_16x8(random_uint8_t, ph_subsequence);
+                    }
+                }
+                else
+                {
+                    const index_t subtile_n0 = (get_lane_id() >> 4) & 1;
+                    const index_t ph_offset  = (get_lane_id() & 47) + ((wg_n0 % 2) << 4);
+                    const ck_tile::philox ph(ph_seed, ph_head_offset + ph_offset);
+                    if constexpr(MIterPerWarp == 1)
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 4);
+                        ph.get_random_4x8(
+                            random_uint8_t, ph_subsequence, subtile_m0 * 2 + subtile_n0);
+                    }
+                    else
+                    {
+                        static_assert(randval_dist_generated.kThreadElementSpaceSize == 8);
+                        ph.get_random_8x8(
+                            random_uint8_t, ph_subsequence, 0 * 2 + subtile_n0, 1 * 2 + subtile_n0);
+                    }
+                }
             }
-        }();
-        constexpr index_t kNPerStep = NWarp * WG::kN;
 
-        // register distribute
-        auto randval = make_static_distributed_tensor<uint8_t>(
-            MakeRandValTileDistribution<BlockGemm, false>());
-        if constexpr(IsWG32)
-            static_assert(randval.kThreadElementSpaceSize == 16);
-        else
-            static_assert(randval.kThreadElementSpaceSize == 4 ||
-                          randval.kThreadElementSpaceSize == 8);
+            constexpr auto randval_dist_generated_spans =
+                decltype(randval_dist_generated)::get_distributed_spans();
+            int i_random_idx = 0;
+            sweep_tile_span(randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx          = ck_tile::make_tuple(idx0, idx1);
+                    randval_dist_generated(i_j_idx) = random_uint8_t[i_random_idx++];
+                });
+            });
+            return randval_dist_generated;
+        };
 
         static_for<0, kNPerBlock / kNPerStep, 1>{}([&](auto i_n0) {
             static_for<0, kMPerBlock / kMPerStep, 1>{}([&](auto i_m0) {
-                int block_row_start, block_col_start;
-                if constexpr(IsWG32)
-                {
-                    block_row_start = (start_m0_idx / WG::kM) + i_m0;
-                    block_col_start = (start_n0_idx / WG::kN) + (i_n0 * NWarp) + get_warp_id();
-                }
-                else
-                {
-                    block_row_start = start_m0_idx / 32 + i_m0;
-                    block_col_start = (start_n0_idx / 32) + get_warp_id() / 2 + i_n0 * 2;
-                }
-                uint2 rowcol = make_uint2(block_row_start, block_col_start);
-
-                // generate random number
-                uint8_t* random_uint8_t_;
-                if constexpr(MBwdWG16SingleIterCheck)
-                {
-                    uint8_t random_uint8_t[4];
-                    // m0t0 ~m0t15/m0t32~m0t47: 0
-                    // m0t16~m0t31/m0t48~m0t63: 1
-                    // m1t0 ~m1t15/m1t32~m1t47: 2
-                    // m1t16~m1t31/m1t48~m1t63: 3
-                    const index_t start_idx =
-                        ((get_lane_id() >> 4) & 1) + (((start_m0_idx >> 4) & 1) << 1);
-                    ph.get_random_4x8(
-                        random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol), start_idx);
-                    random_uint8_t_ = random_uint8_t;
-                }
-                else if constexpr(MBwdWG16MultiIterCheck)
-                {
-                    uint8_t random_uint8_t[8];
-                    // t0 ~t15/t32~t47: 0
-                    // t16~t31/t48~t63: 1
-                    const index_t start_idx = (get_lane_id() >> 4) & 1;
-                    ph.get_random_8x8(
-                        random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol), start_idx);
-                    random_uint8_t_ = random_uint8_t;
-                }
-                else
-                {
-                    uint8_t random_uint8_t[16];
-                    ph.get_random_16x8(random_uint8_t,
-                                       reinterpret_cast<unsigned long long&>(rowcol));
-                    random_uint8_t_ = random_uint8_t;
-                }
-
+                const auto randval = generate_randval(i_m0, i_n0);
+                // Drop values of P based on the generated probabilities, negative sign is used to
+                // distinguish such values ​​later in bwd pipeline.
                 constexpr auto randval_spans = decltype(randval)::get_distributed_spans();
-                int i_random_idx             = 0;
                 sweep_tile_span(randval_spans[number<0>{}], [&](auto idx0) {
                     sweep_tile_span(randval_spans[number<1>{}], [&](auto idx1) {
-                        constexpr auto r_idx  = ck_tile::make_tuple(idx0, idx1);
-                        randval(r_idx)        = random_uint8_t_[i_random_idx++];
-                        constexpr auto p_idx0 = tile_distributed_index<i_m0 + idx0.impl_.at(0),
-                                                                       idx0.impl_.at(1),
-                                                                       idx0.impl_.at(2)>{};
+                        constexpr auto r_idx = ck_tile::make_tuple(idx0, idx1);
+                        constexpr auto p_idx0 =
+                            tile_distributed_index<i_m0 * MIterPerWarp +
+                                                       idx0.impl_.template at<0>(),
+                                                   idx0.impl_.template at<1>(),
+                                                   idx0.impl_.template at<2>()>{};
                         constexpr auto p_idx1 = tile_distributed_index<i_n0>{};
                         constexpr auto p_idx  = ck_tile::make_tuple(p_idx0, p_idx1);
                         p_compute(p_idx)      = randval[r_idx] <= p_undrop_in_uint8_t
@@ -717,7 +645,8 @@ struct BlockDropoutBwd<true, IsWG32_, IsStoreRandval_>
         }
     }
 
-    ck_tile::philox ph;
+    const unsigned long long ph_seed;
+    const unsigned long long ph_head_offset;
     const float rp_undrop;
     const uint8_t p_undrop_in_uint8_t;
 };
diff --git a/include/ck_tile/ops/fmha/block/block_masking.hpp b/include/ck_tile/ops/fmha/block/block_masking.hpp
index f5c12e11d2..2c45945fac 100644
--- a/include/ck_tile/ops/fmha/block/block_masking.hpp
+++ b/include/ck_tile/ops/fmha/block/block_masking.hpp
@@ -203,27 +203,36 @@ struct GenericAttentionMask
     CK_TILE_HOST_DEVICE constexpr auto
     IsEdgeTile(index_t i_tile_top, index_t i_tile_left, number<TileHeight>, number<TileWidth>) const
     {
-        if constexpr(IsLocal)
+        if constexpr(!IsMasking)
         {
-            // check top-right corner > x or left-borrom corner < x
-            index_t i_tile_right  = i_tile_left + TileWidth;
-            index_t i_tile_bottom = i_tile_top + TileHeight;
-            index_t x_end         = min(i_tile_top + x, x_total);
-
-            bool top_right_edge          = i_tile_right > (i_tile_top + x);
-            bool bottom_left_edge        = i_tile_bottom > (i_tile_left + y);
-            bool is_partial_out_of_bound = i_tile_right > x_end; // only consider right-pad for now
-
-            return top_right_edge || bottom_left_edge || is_partial_out_of_bound;
+            // TODO: no need to check begin
+            return (i_tile_left + TileWidth) > x_total;
         }
         else
         {
-            // only need to check top-right corner > x
-            index_t i_tile_right = i_tile_left + TileWidth;
-            index_t x_end        = min(i_tile_top + x, x_total);
+            if constexpr(IsLocal)
+            {
+                // check top-right corner > x or left-borrom corner < x
+                index_t i_tile_right  = i_tile_left + TileWidth;
+                index_t i_tile_bottom = i_tile_top + TileHeight;
+                index_t x_end         = min(i_tile_top + x, x_total);
 
-            bool top_right_edge = i_tile_right > x_end;
-            return top_right_edge;
+                bool top_right_edge   = i_tile_right > (i_tile_top + x);
+                bool bottom_left_edge = i_tile_bottom > (i_tile_left + y);
+                bool is_partial_out_of_bound =
+                    i_tile_right > x_end; // only consider right-pad for now
+
+                return top_right_edge || bottom_left_edge || is_partial_out_of_bound;
+            }
+            else
+            {
+                // only need to check top-right corner > x
+                index_t i_tile_right = i_tile_left + TileWidth;
+                index_t x_end        = min(i_tile_top + x, x_total);
+
+                bool top_right_edge = i_tile_right > x_end;
+                return top_right_edge;
+            }
         }
     }
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
index 2850ce3379..56865498c0 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
@@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
         // divide problem
         const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_q       = 0;
         long_index_t batch_offset_bias    = 0;
@@ -1143,7 +1143,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
                              make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
                              {i_m0, i_n1});
 
-        EpiloguePipeline{}(o_dram_window, o_acc_tile);
+        EpiloguePipeline{}(o_dram_window, o_acc_tile, nullptr);
     }
 };
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index c1f85cb5e6..980dfb06ae 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -60,12 +60,12 @@ struct FmhaBwdDQDKDVKernel
     using VGradDataType    = ck_tile::remove_cvref_t<typename FmhaPipeline::VGradDataType>;
     using BiasGradDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::BiasGradDataType>;
 
-    static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
-    static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
-    static constexpr auto BiasEnum     = FmhaPipeline::BiasEnum;
-    static constexpr bool kHasBiasGrad = FmhaPipeline::kHasBiasGrad;
-    using FmhaMask                     = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
+    static constexpr bool kIsGroupMode    = FmhaPipeline::kIsGroupMode;
+    static constexpr index_t kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
+    static constexpr auto BiasEnum        = FmhaPipeline::BiasEnum;
+    static constexpr bool kHasBiasGrad    = FmhaPipeline::kHasBiasGrad;
+    using FmhaMask                    = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
     using FmhaDropout                 = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaDropout>;
     static constexpr bool kHasMask    = FmhaMask::IsMasking;
     static constexpr bool kHasDropout = FmhaDropout::IsDropout;
@@ -75,13 +75,14 @@ struct FmhaBwdDQDKDVKernel
     static constexpr index_t kMaxSeqLenQ   = FmhaPipeline::BlockFmhaShape::kMaxSeqLenQ;
     static_assert(kUseQrQtrDorPipeline == (kMaxSeqLenQ != 0));
 #if defined(__gfx950__)
-    static constexpr bool kIsAvialable = true;
+    static constexpr bool kIsAvailable = true;
 #else
-    static constexpr bool kIsAvialable = !kUseTrLoad;
+    static constexpr bool kIsAvailable = !kUseTrLoad;
 #endif
 
     // clang-format off
     template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
     template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
     template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     // clang-format on
@@ -100,8 +101,8 @@ struct FmhaBwdDQDKDVKernel
         #define _TS_  std::to_string
         auto pn = [&] () {
             std::string n;
-            if (kPadHeadDimQ) n += "d";
-            if (kPadHeadDimV) n += "dv";
+            if (kPadHeadDimQ) n += "d" + _TS_(kPadHeadDimQ);
+            if (kPadHeadDimV) n += "dv"+ _TS_(kPadHeadDimV);
             return n.empty() ? n : std::string("p") + n; }();
         return
             _SS_("fmha_bwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
@@ -113,9 +114,11 @@ struct FmhaBwdDQDKDVKernel
             "r" + _TS_(gbr4::at(ck_tile::number<0>{})) + "x" + _TS_(gbr4::at(ck_tile::number<1>{})) + "x" + _TS_(gbr4::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt0::at(ck_tile::number<0>{})) + "x" + _TS_(gwt0::at(ck_tile::number<1>{})) + "x" + _TS_(gwt0::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt1::at(ck_tile::number<0>{})) + "x" + _TS_(gwt1::at(ck_tile::number<1>{})) + "x" + _TS_(gwt1::at(ck_tile::number<2>{})) + "_" +
-            ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn) +
+            ("o" + _TS_(kBlockPerCu)) + "_" +
+            ("maxq" + _TS_(kMaxSeqLenQ)) +
+            (pn.empty() ? "_npad" : "_" + pn) +
             (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-            (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? "_dropout" : "_ndropout" ) +
+            (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? gwt0::at(ck_tile::number<0>{}) == 16? "_dropout_wg16":"_dropout_wg32" : "_ndropout" ) +
             (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? "_deterministic" : "_ndeterministic" ) + (kUseTrLoad ? "_trload" : "_ntrload");
         #undef _SS_
         #undef _TS_
@@ -676,7 +679,7 @@ struct FmhaBwdDQDKDVKernel
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
-        if constexpr(kIsAvialable)
+        if constexpr(kIsAvailable)
             run_(std::move(kargs));
     }
 
@@ -688,7 +691,7 @@ struct FmhaBwdDQDKDVKernel
         // divide problem
         const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex();
 
-        const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0);
+        const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0);
 
         long_index_t batch_offset_q       = 0;
         long_index_t batch_offset_k       = 0;
@@ -813,7 +816,7 @@ struct FmhaBwdDQDKDVKernel
         const auto q_dram = pad_tensor_view(
             q_dram_naive,
             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-            sequence<false, kPadHeadDimQ>{});
+            sequence<false, (kPadHeadDimQ > 0)>{});
 
         const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
             k_ptr,
@@ -824,7 +827,7 @@ struct FmhaBwdDQDKDVKernel
         const auto k_dram = pad_tensor_view(
             k_dram_naive,
             make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-            sequence<false, kPadHeadDimQ>{});
+            sequence<false, (kPadHeadDimQ > 0)>{});
 
         const auto v_dram = [&]() {
             const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
@@ -836,7 +839,7 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 v_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kVHeaddim>{}),
-                sequence<false, kPadHeadDimV>{});
+                sequence<false, (kPadHeadDimV > 0)>{});
         }();
 
         // lse and d should be fine to read unpaded data as they are not on the reduction dimension
@@ -855,7 +858,7 @@ struct FmhaBwdDQDKDVKernel
         const auto do_dram = pad_tensor_view(
             do_dram_naive,
             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kVHeaddim>{}),
-            sequence<false, kPadHeadDimV>{});
+            sequence<false, (kPadHeadDimV > 0)>{});
 
         auto q_dram_window = make_tile_window(
             q_dram,
@@ -903,7 +906,7 @@ struct FmhaBwdDQDKDVKernel
             const auto dq_acc_dram = pad_tensor_view(
                 dq_acc_dram_naive,
                 make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-                sequence<false, kPadHeadDimQ>{});
+                sequence<false, (kPadHeadDimQ > 0)>{});
             return make_tile_window(
                 dq_acc_dram,
                 make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
@@ -1087,7 +1090,7 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 dk_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-                sequence<false, kPadHeadDimQ>{});
+                sequence<false, (kPadHeadDimQ > 0)>{});
         }();
 
         auto dv_dram = [&]() {
@@ -1101,7 +1104,7 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 dv_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kVHeaddim>{}),
-                sequence<false, kPadHeadDimV>{});
+                sequence<false, (kPadHeadDimV > 0)>{});
         }();
 
         auto dk_dram_window = make_tile_window(
@@ -1185,6 +1188,7 @@ struct FmhaBwdOGradDotOKernel
 
     // clang-format off
     template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
     template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
     template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     // clang-format on
@@ -1336,7 +1340,7 @@ struct FmhaBwdOGradDotOKernel
         // divide problem
         const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
 
         long_index_t batch_offset_o  = 0;
         long_index_t batch_offset_do = 0;
@@ -1441,6 +1445,7 @@ struct FmhaBwdConvertQGradKernel
 
     // clang-format off
     template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
     template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
     template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     // clang-format on
@@ -1616,7 +1621,7 @@ struct FmhaBwdConvertQGradKernel
         // divide problem
         const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
 
         long_index_t batch_offset_dq     = 0;
         long_index_t batch_offset_dq_acc = 0;
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
index 66f51459af..a82d121d62 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
@@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel
         // divide problem
         const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
-        const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0);
+        const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0);
 
         const index_t i_cache_batch = [&, i_batch_ = i_batch] {
             if constexpr(kIsPagedKV)
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index ddc5c5447f..dafe99febe 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -65,19 +65,21 @@ struct FmhaFwdKernel
 
     static constexpr bool kUseTrLoad = FmhaPipeline::Problem::kUseTrLoad;
 #if defined(__gfx950__)
-    static constexpr bool kIsAvialable = true;
+    static constexpr bool kIsAvailable = true;
 #else
-    static constexpr bool kIsAvialable = !kUseTrLoad;
+    static constexpr bool kIsAvailable = !kUseTrLoad;
 #endif
     static constexpr std::string_view kPipelineName = FmhaPipeline::name;
 
     // clang-format off
-    template <typename T> struct t2s;
+    template <typename T1, typename T2 = T1> struct t2s;
     template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
     template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
     template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
     template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    template <> struct t2s<ck_tile::fp8_t, ck_tile::bf16_t> { static constexpr const char * name = "fp8bf16"; };
+    template <> struct t2s<ck_tile::fp8_t, ck_tile::fp32_t> { static constexpr const char * name = "fp8fp32"; };
     // clang-format on
 
     CK_TILE_HOST static std::string GetName()
@@ -99,7 +101,7 @@ struct FmhaFwdKernel
             if (kPadHeadDimV) n += "dv";
             return n.empty() ? n : std::string("p") + n; }();
         return
-            _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
+            _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType, ODataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
@@ -291,6 +293,11 @@ struct FmhaFwdKernel
         ck_tile::index_t batch_stride_k;
         ck_tile::index_t batch_stride_v;
         ck_tile::index_t batch_stride_o;
+
+        // Optional cumulative sequence length pointers for batch mode
+        // If provided, they override seqlen_q / seqlen_k per-batch to skip tail padding.
+        const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr; // cumulative, length without PAD
+        const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr; // cumulative, length without PAD
     };
 
     struct FmhaFwdGroupModeKargs
@@ -310,6 +317,11 @@ struct FmhaFwdKernel
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
+
+        // Optional cumulative padded sequence starts (including PAD tokens)
+        // Used solely to compute memory offsets when sequences are physically padded.
+        const int32_t* seqstart_padded_q_ptr = nullptr;
+        const int32_t* seqstart_padded_k_ptr = nullptr;
     };
 
     using Kargs = std::conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
@@ -366,7 +378,9 @@ struct FmhaFwdKernel
                   float p_drop,
                   bool s_randval,
                   std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                      drop_seed_offset)
+                      drop_seed_offset,
+                  const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr,
+                  const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -457,6 +471,8 @@ struct FmhaFwdKernel
             kargs.init_logits_soft_cap(logits_soft_cap);
         }
 
+        kargs.cu_seqlen_q_ptr  = cu_seqlen_q_ptr;
+        kargs.cu_seqlen_kv_ptr = cu_seqlen_kv_ptr;
         return kargs;
     }
 
@@ -505,7 +521,9 @@ struct FmhaFwdKernel
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
-              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset,
+              const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr,
+              const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr)
     {
         return MakeKargsImpl(
             q_ptr,
@@ -550,7 +568,9 @@ struct FmhaFwdKernel
             mask_type,
             p_drop,
             s_randval,
-            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)),
+            cu_seqlen_q_ptr,
+            cu_seqlen_kv_ptr);
     }
 
     // std::variant<> can't take in a list initializer, overload for backward compatibility
@@ -598,7 +618,9 @@ struct FmhaFwdKernel
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
-              const std::tuple<const void*, const void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset,
+              const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr,
+              const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr)
     {
         return MakeKargsImpl(
             q_ptr,
@@ -643,7 +665,9 @@ struct FmhaFwdKernel
             mask_type,
             p_drop,
             s_randval,
-            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)),
+            cu_seqlen_q_ptr,
+            cu_seqlen_kv_ptr);
     }
 
     template <bool Cond = kIsGroupMode>
@@ -686,7 +710,9 @@ struct FmhaFwdKernel
                   float p_drop,
                   bool s_randval,
                   std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                      drop_seed_offset)
+                      drop_seed_offset,
+                  const void* seqstart_padded_q_ptr = nullptr,
+                  const void* seqstart_padded_k_ptr = nullptr)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -778,6 +804,8 @@ struct FmhaFwdKernel
             kargs.min_seqlen_q = min_seqlen_q;
         }
 
+        kargs.seqstart_padded_q_ptr = reinterpret_cast<const int32_t*>(seqstart_padded_q_ptr);
+        kargs.seqstart_padded_k_ptr = reinterpret_cast<const int32_t*>(seqstart_padded_k_ptr);
         return kargs;
     }
 
@@ -821,7 +849,9 @@ struct FmhaFwdKernel
               ck_tile::index_t min_seqlen_q,
               float p_drop,
               bool s_randval,
-              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset,
+              const void* seqstart_padded_q_ptr = nullptr,
+              const void* seqstart_padded_k_ptr = nullptr)
     {
         return MakeKargsImpl(
             q_ptr,
@@ -861,7 +891,9 @@ struct FmhaFwdKernel
             min_seqlen_q,
             p_drop,
             s_randval,
-            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)),
+            seqstart_padded_q_ptr,
+            seqstart_padded_k_ptr);
     }
 
     // std::variant<> can't take in a list initializer, overload for backward compatibility
@@ -904,7 +936,9 @@ struct FmhaFwdKernel
               ck_tile::index_t min_seqlen_q,
               float p_drop,
               bool s_randval,
-              const std::tuple<const void*, const void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset,
+              const void* seqstart_padded_q_ptr = nullptr,
+              const void* seqstart_padded_k_ptr = nullptr)
     {
         return MakeKargsImpl(
             q_ptr,
@@ -944,7 +978,9 @@ struct FmhaFwdKernel
             min_seqlen_q,
             p_drop,
             s_randval,
-            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)),
+            seqstart_padded_q_ptr,
+            seqstart_padded_k_ptr);
     }
 
     CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_,
@@ -1046,7 +1082,7 @@ struct FmhaFwdKernel
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
-        if constexpr(kIsAvialable)
+        if constexpr(kIsAvailable)
             run_(std::move(kargs));
     }
 
@@ -1060,8 +1096,8 @@ struct FmhaFwdKernel
             // divide problem
             const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-            const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-            const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+            const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+            const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
             long_index_t batch_offset_q       = 0;
             long_index_t batch_offset_k       = 0;
@@ -1073,35 +1109,44 @@ struct FmhaFwdKernel
 
             if constexpr(kIsGroupMode)
             {
-                // get starting offset for each batch
-                const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-                const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+                // logical and physical (padded) starts
+                const long_index_t query_start_unpadded = kargs.seqstart_q_ptr[i_batch];
+                const long_index_t key_start_unpadded   = kargs.seqstart_k_ptr[i_batch];
 
-                batch_offset_q = query_start * kargs.stride_q;
-                batch_offset_k = key_start * kargs.stride_k;
+                const long_index_t query_start_padded = kargs.seqstart_padded_q_ptr
+                                                            ? kargs.seqstart_padded_q_ptr[i_batch]
+                                                            : query_start_unpadded;
+                const long_index_t key_start_padded   = kargs.seqstart_padded_k_ptr
+                                                            ? kargs.seqstart_padded_k_ptr[i_batch]
+                                                            : key_start_unpadded;
+
+                // DRAM base offsets use physical padded starts
+                batch_offset_q = query_start_padded * kargs.stride_q;
+                batch_offset_k = key_start_padded * kargs.stride_k;
                 if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                 {
-                    batch_offset_v = key_start * kargs.stride_v;
+                    batch_offset_v = key_start_padded * kargs.stride_v;
                 }
                 else
                 {
-                    batch_offset_v = key_start;
+                    batch_offset_v = key_start_padded;
                 }
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 {
-                    batch_offset_bias = query_start * kargs.stride_bias;
+                    batch_offset_bias = query_start_padded * kargs.stride_bias;
                 }
                 if constexpr(kStoreLSE)
                 {
-                    batch_offset_lse = query_start;
+                    // LSE stays indexed by unpadded starts
+                    batch_offset_lse = query_start_unpadded;
                 }
                 if constexpr(kHasDropout)
                 {
-                    batch_offset_randval = query_start * kargs.stride_randval;
+                    batch_offset_randval = query_start_padded * kargs.stride_randval;
                 }
-                batch_offset_o = query_start * kargs.stride_o;
+                batch_offset_o = query_start_padded * kargs.stride_o;
 
-                // get real # queries & # keys under group mode
+                // real logical lengths (exclude PAD)
                 const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
                 kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
 
@@ -1113,8 +1158,7 @@ struct FmhaFwdKernel
                     }
                 }
 
-                // # of required blocks is different in each groups, terminate unnecessary blocks
-                // earlier
+                // terminate unnecessary blocks earlier
                 if(kargs.seqlen_q <= i_m0)
                 {
                     return;
@@ -1150,6 +1194,18 @@ struct FmhaFwdKernel
                         static_cast<long_index_t>(i_batch) * kargs.batch_stride_randval;
                 }
                 batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
+
+                // If cumulative seqlen pointers are provided, override per-batch effective lengths
+                if(kargs.cu_seqlen_q_ptr != nullptr)
+                {
+                    kargs.seqlen_q =
+                        kargs.cu_seqlen_q_ptr[i_batch + 1] - kargs.cu_seqlen_q_ptr[i_batch];
+                }
+                if(kargs.cu_seqlen_kv_ptr != nullptr)
+                {
+                    kargs.seqlen_k =
+                        kargs.cu_seqlen_kv_ptr[i_batch + 1] - kargs.cu_seqlen_kv_ptr[i_batch];
+                }
             }
 
             // for simplicity, batch stride we just modify the pointer
@@ -1446,29 +1502,35 @@ struct FmhaFwdKernel
             auto o_acc_tile = [&]() {
                 if constexpr(kDoFp8StaticQuant)
                 {
-                    return FmhaPipeline{}(
-                        q_dram_window,
-                        identity{}, // q_element_func
-                        k_dram_window,
-                        identity{}, // k_element_func
-                        v_dram_window,
-                        identity{}, // v_element_func
-                        bias_dram_window,
-                        identity{}, // bias_element_func
-                        randval_dram_window,
-                        lse_dram_window,
-                        identity{},            // lse_element_func
-                        identity{},            // s_acc_element_func
-                        scales{kargs.scale_p}, // p_compute_element_func
-                        composes(saturates<fp8_t>{}, scales{kargs.scale_o}), // o_acc_element_func
-                        mask,
-                        position_encoding,
-                        kargs.scale_s,
-                        variant,
-                        variant_params,
-                        block_indices,
-                        smem_ptr,
-                        dropout);
+                    auto o_acc_element_func = [&]() {
+                        if constexpr(std::is_same_v<ODataType, ck_tile::fp8_t>)
+                            return ck_tile::composes(ck_tile::saturates<ck_tile::fp8_t>{},
+                                                     ck_tile::scales{kargs.scale_o});
+                        else
+                            return ck_tile::scales{kargs.scale_o};
+                    }();
+                    return FmhaPipeline{}(q_dram_window,
+                                          identity{}, // q_element_func
+                                          k_dram_window,
+                                          identity{}, // k_element_func
+                                          v_dram_window,
+                                          identity{}, // v_element_func
+                                          bias_dram_window,
+                                          identity{}, // bias_element_func
+                                          randval_dram_window,
+                                          lse_dram_window,
+                                          identity{},            // lse_element_func
+                                          identity{},            // s_acc_element_func
+                                          scales{kargs.scale_p}, // p_compute_element_func
+                                          o_acc_element_func,    // o_acc_element_func
+                                          mask,
+                                          position_encoding,
+                                          kargs.scale_s,
+                                          variant,
+                                          variant_params,
+                                          block_indices,
+                                          smem_ptr,
+                                          dropout);
                 }
                 else
                 {
@@ -1542,26 +1604,35 @@ struct FmhaFwdKernel
             if constexpr(kIsGroupMode)
             {
                 // get starting offset for each batch
-                const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-                const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+                const long_index_t query_start_unpadded = kargs.seqstart_q_ptr[i_batch];
+                const long_index_t key_start_unpadded   = kargs.seqstart_k_ptr[i_batch];
 
-                batch_offset_q = query_start * kargs.stride_q;
-                batch_offset_k = key_start * kargs.stride_k;
+                const long_index_t query_start_padded = kargs.seqstart_padded_q_ptr
+                                                            ? kargs.seqstart_padded_q_ptr[i_batch]
+                                                            : query_start_unpadded;
+                const long_index_t key_start_padded   = kargs.seqstart_padded_k_ptr
+                                                            ? kargs.seqstart_padded_k_ptr[i_batch]
+                                                            : key_start_unpadded;
+
+                batch_offset_q = query_start_padded * kargs.stride_q;
+                batch_offset_k = key_start_padded * kargs.stride_k;
                 if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                 {
-                    batch_offset_v = key_start * kargs.stride_v;
+                    batch_offset_v = key_start_padded * kargs.stride_v;
                 }
                 else
                 {
-                    batch_offset_v = key_start;
+                    // col-major V: offset along seqlen dimension is scalar index
+                    batch_offset_v = key_start_padded;
                 }
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 {
-                    batch_offset_bias = query_start * kargs.stride_bias;
+                    batch_offset_bias = query_start_padded * kargs.stride_bias;
                 }
 
-                batch_offset_lse = query_start;
-                batch_offset_o   = query_start * kargs.stride_o;
+                // LSE layout is [nhead, total_seqlen], index by unpadded start
+                batch_offset_lse = query_start_unpadded;
+                batch_offset_o   = query_start_padded * kargs.stride_o;
 
                 // get real # queries & # keys under group mode
                 kargs.seqlen_q = kargs.seqstart_q_ptr[i_batch + 1] - kargs.seqstart_q_ptr[i_batch];
@@ -1599,6 +1670,18 @@ struct FmhaFwdKernel
                     batch_offset_bias =
                         static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
                 }
+
+                // If cumulative seqlen pointers are provided, override per-batch effective lengths
+                if(kargs.cu_seqlen_q_ptr != nullptr)
+                {
+                    kargs.seqlen_q =
+                        kargs.cu_seqlen_q_ptr[i_batch + 1] - kargs.cu_seqlen_q_ptr[i_batch];
+                }
+                if(kargs.cu_seqlen_kv_ptr != nullptr)
+                {
+                    kargs.seqlen_k =
+                        kargs.cu_seqlen_kv_ptr[i_batch + 1] - kargs.cu_seqlen_kv_ptr[i_batch];
+                }
             }
 
             // for simplicity, batch stride we just modify the pointer
@@ -1761,6 +1844,9 @@ struct FmhaFwdKernel
                     make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
                     sequence<false, kPadHeadDimQ>{});
 
+                constexpr auto kDramTileK =
+                    FmhaPipeline::kKLoadOnce ? FmhaPipeline::kQKHeaddim : FmhaPipeline::kK0;
+
 #if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
                 constexpr index_t LDSLayerSize  = 256 / sizeof(KDataType);
                 constexpr index_t XorLengthFold = LDSLayerSize / (FmhaPipeline::kQKHeaddim);
@@ -1829,32 +1915,36 @@ struct FmhaFwdKernel
                 {
                     const auto k_dram_unmerged = transform_tensor_view(
                         k_dram_pad,
-                        make_tuple(
-                            make_pass_through_transform(height),
-                            make_unmerge_transform(make_tuple(
-                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{},
-                                number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(make_pass_through_transform(height),
+                                   make_unmerge_transform(
+                                       make_tuple(number<FmhaPipeline::kQKHeaddim / kDramTileK /
+                                                         FmhaPipeline::kAlignmentK>{},
+                                                  number<kDramTileK / FmhaPipeline::kAlignmentK>{},
+                                                  number<FmhaPipeline::kAlignmentK>{}))),
                         make_tuple(sequence<0>{}, sequence<1>{}),
-                        make_tuple(sequence<0>{}, sequence<1, 2>{}));
+                        make_tuple(sequence<0>{}, sequence<1, 2, 3>{}));
 
                     const auto k_dram_permuted = transform_tensor_view(
                         k_dram_unmerged,
                         make_tuple(
                             make_xor_transform(make_tuple(
-                                height,
-                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{})),
+                                height, number<kDramTileK / FmhaPipeline::kAlignmentK>{})),
+                            make_pass_through_transform(
+                                number<FmhaPipeline::kQKHeaddim / kDramTileK /
+                                       FmhaPipeline::kAlignmentK>{}),
                             make_pass_through_transform(number<FmhaPipeline::kAlignmentK>{})),
-                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
-                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}),
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
 
                     return transform_tensor_view(
                         k_dram_permuted,
-                        make_tuple(
-                            make_pass_through_transform(height),
-                            make_merge_transform_v3_division_mod(make_tuple(
-                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{},
-                                number<FmhaPipeline::kAlignmentK>{}))),
-                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(make_pass_through_transform(height),
+                                   make_merge_transform_v3_division_mod(
+                                       make_tuple(number<FmhaPipeline::kQKHeaddim / kDramTileK /
+                                                         FmhaPipeline::kAlignmentK>{},
+                                                  number<kDramTileK / FmhaPipeline::kAlignmentK>{},
+                                                  number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2, 3>{}),
                         make_tuple(sequence<0>{}, sequence<1>{}));
                 }
             };
@@ -1868,7 +1958,7 @@ struct FmhaFwdKernel
                 const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                     data, // will update this pointer if using paged-kvcache
                     make_tuple(length, kargs.hdim_v),
-                    make_tuple(kargs.hdim_v, 1),
+                    make_tuple(kargs.stride_v, 1),
                     number<FmhaPipeline::kAlignmentV>{},
                     number<1>{});
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
index 58ef6ba87e..62ac70db92 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
@@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel
         // divide problem
         const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_q    = 0;
         long_index_t batch_offset_k    = 0;
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index cf819c4b8d..a6fc0f1471 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel
         // divide problem
         const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_lse_acc = 0;
         long_index_t batch_offset_o_acc   = 0;
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 9293c97a31..80de65ead4 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel
         // divide problem
         const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_q       = 0;
         long_index_t batch_offset_k       = 0; // unused for paged-kvcache
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp
index be14a36353..e9115b14df 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp
@@ -81,6 +81,7 @@ struct FmhaFwdV3Kernel
         // ck_tile::index_t window_size_left, window_size_right;
         ck_tile::index_t window_size_left, window_size_right;
         ck_tile::GenericAttentionMaskEnum mask_type;
+        ck_tile::index_t remap_opt;
     };
 
     struct FmhaFwdCommonLSEKargs
@@ -99,6 +100,11 @@ struct FmhaFwdV3Kernel
         ck_tile::index_t batch_stride_k;
         ck_tile::index_t batch_stride_v;
         ck_tile::index_t batch_stride_o;
+
+        // Optional cumulative sequence length pointers for batch mode
+        // If provided, they override seqlen_q / seqlen_k per-batch to skip tail padding.
+        const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr; // [batch+1]
+        const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr; // [batch+1]
     };
 
     struct FmhaFwdGroupModeKargs
@@ -109,6 +115,11 @@ struct FmhaFwdV3Kernel
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
+
+        // Optional cumulative padded sequence starts (including PAD tokens)
+        // Used solely to compute memory offsets when sequences are physically padded.
+        const int32_t* seqstart_padded_q_ptr = nullptr; // [batch+1]
+        const int32_t* seqstart_padded_k_ptr = nullptr; // [batch+1]
     };
 
     using Kargs = std::conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
@@ -143,7 +154,10 @@ struct FmhaFwdV3Kernel
               ck_tile::index_t batch_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type)
+              ck_tile::index_t mask_type,
+              ck_tile::index_t remap_opt,
+              const ck_tile::index_t* cu_seqlen_q_ptr  = nullptr,
+              const ck_tile::index_t* cu_seqlen_kv_ptr = nullptr)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -176,6 +190,7 @@ struct FmhaFwdV3Kernel
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
+            kargs.remap_opt         = remap_opt;
         }
         if constexpr(kStoreLSE)
         {
@@ -184,6 +199,8 @@ struct FmhaFwdV3Kernel
             kargs.batch_stride_lse = batch_stride_lse;
         }
 
+        kargs.cu_seqlen_q_ptr  = cu_seqlen_q_ptr;
+        kargs.cu_seqlen_kv_ptr = cu_seqlen_kv_ptr;
         return kargs;
     }
 
@@ -213,7 +230,10 @@ struct FmhaFwdV3Kernel
               ck_tile::index_t nhead_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type)
+              ck_tile::index_t mask_type,
+              ck_tile::index_t remap_opt,
+              const void* seqstart_padded_q_ptr = nullptr,
+              const void* seqstart_padded_k_ptr = nullptr)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -245,6 +265,7 @@ struct FmhaFwdV3Kernel
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
+            kargs.remap_opt         = remap_opt;
         }
         if constexpr(kStoreLSE)
         {
@@ -252,6 +273,8 @@ struct FmhaFwdV3Kernel
             kargs.nhead_stride_lse = nhead_stride_lse;
         }
 
+        kargs.seqstart_padded_q_ptr = reinterpret_cast<const int32_t*>(seqstart_padded_q_ptr);
+        kargs.seqstart_padded_k_ptr = reinterpret_cast<const int32_t*>(seqstart_padded_k_ptr);
         return kargs;
     }
 
@@ -261,39 +284,81 @@ struct FmhaFwdV3Kernel
                                                 ck_tile::index_t hdim_v_)
     {
         // TODO: this may need tuning
-        return dim3(ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1),
-                    nhead_,
-                    batch_size_);
-    }
-
-    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs)
-    {
-        using namespace ck_tile;
-
-        // const index_t num_tile_m0 = seqlen_q / kM0;
-        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1);
-
-        const index_t i_block = blockIdx.x;
-        const index_t i_nhead = blockIdx.y;
-        const index_t i_batch = blockIdx.z;
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck_tile::make_tuple(quotient, modulus);
-        };
-
-        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
         if constexpr(kHasMask)
         {
-            // assume that num_tile_n1 is always 1
-            return ck_tile::make_tuple(gridDim.x - 1 - i_tile_m, i_tile_n, i_nhead, i_batch);
+            return dim3(nhead_,
+                        ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) *
+                            ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1),
+                        batch_size_);
         }
         else
         {
-            return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+            return dim3(nhead_,
+                        ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) *
+                            ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1),
+                        batch_size_);
+        }
+    }
+
+    CK_TILE_DEVICE static constexpr auto
+    RemapTileIndices(int32_t tg_idx, int32_t tg_idy, int32_t remap_option)
+    {
+        if(remap_option < 1)
+        {
+            return make_tuple(static_cast<int32_t>(gridDim.x - tg_idx - 1), tg_idy);
+        }
+
+        int32_t remapped_tg_idx = tg_idx;
+        int32_t remapped_tg_idy = tg_idy;
+
+        if(remap_option == 2)
+        { // special remapping
+            int32_t tmp0 = (remapped_tg_idy & 0x7) * gridDim.x + remapped_tg_idx;
+            int32_t tmp1 = tmp0 & 0x7;
+
+            remapped_tg_idx = tmp0 >> 3;
+            remapped_tg_idy = (remapped_tg_idy & 0xfffffff8) + tmp1;
+        }
+        else
+        { // normal remapping
+            int32_t cus_per_xdim_per_xcc = gridDim.x >> 3;
+            int32_t tgs_cu_id            = remapped_tg_idx >> 3;
+
+            if(tgs_cu_id < cus_per_xdim_per_xcc)
+            {
+                int32_t tgs_xcc_id = remapped_tg_idx & 0x7;
+                int32_t new_tg_idx = tgs_xcc_id * cus_per_xdim_per_xcc + tgs_cu_id;
+
+                remapped_tg_idx = new_tg_idx;
+            }
+        }
+
+        return make_tuple(remapped_tg_idx, remapped_tg_idy);
+    }
+
+    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs&)
+    {
+        using namespace ck_tile;
+
+        // const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v,
+        // FmhaPipeline::kN1);
+
+        // assume that num_tile_n1 is always 1
+        if constexpr(kHasMask)
+        {
+            const index_t i_nhead = blockIdx.x;
+            const index_t i_block = blockIdx.y;
+            const index_t i_batch = blockIdx.z;
+
+            return ck_tile::make_tuple(gridDim.y - 1 - i_block, 0, i_nhead, i_batch);
+        }
+        else
+        {
+            const index_t i_nhead = blockIdx.x;
+            const index_t i_block = blockIdx.y;
+            const index_t i_batch = blockIdx.z;
+
+            return ck_tile::make_tuple(i_block, 0, i_nhead, i_batch);
         }
     }
 
@@ -314,8 +379,8 @@ struct FmhaFwdV3Kernel
         // divide problem
         const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+        const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_q   = 0;
         long_index_t batch_offset_k   = 0;
@@ -326,18 +391,26 @@ struct FmhaFwdV3Kernel
         if constexpr(kIsGroupMode)
         {
             // get starting offset for each batch
-            const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-            const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+            const long_index_t query_start_unpadded = kargs.seqstart_q_ptr[i_batch];
+            const long_index_t key_start_unpadded   = kargs.seqstart_k_ptr[i_batch];
 
-            batch_offset_q = query_start * kargs.stride_q;
-            batch_offset_k = key_start * kargs.stride_k;
-            batch_offset_v = key_start * kargs.stride_v;
+            const long_index_t query_start_padded = kargs.seqstart_padded_q_ptr
+                                                        ? kargs.seqstart_padded_q_ptr[i_batch]
+                                                        : query_start_unpadded;
+            const long_index_t key_start_padded   = kargs.seqstart_padded_k_ptr
+                                                        ? kargs.seqstart_padded_k_ptr[i_batch]
+                                                        : key_start_unpadded;
+
+            batch_offset_q = query_start_padded * kargs.stride_q;
+            batch_offset_k = key_start_padded * kargs.stride_k;
+            batch_offset_v = key_start_padded * kargs.stride_v;
 
             if constexpr(kStoreLSE)
             {
-                batch_offset_lse = query_start;
+                // LSE layout is [nhead, total_seqlen], index by unpadded start
+                batch_offset_lse = query_start_unpadded;
             }
-            batch_offset_o = query_start * kargs.stride_o;
+            batch_offset_o = query_start_padded * kargs.stride_o;
 
             // get real # queries & # keys under group mode
             const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
@@ -370,6 +443,18 @@ struct FmhaFwdV3Kernel
                 batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
             }
             batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
+
+            // If cumulative seqlen pointers are provided, override per-batch effective lengths
+            if(kargs.cu_seqlen_q_ptr != nullptr)
+            {
+                kargs.seqlen_q =
+                    kargs.cu_seqlen_q_ptr[i_batch + 1] - kargs.cu_seqlen_q_ptr[i_batch];
+            }
+            if(kargs.cu_seqlen_kv_ptr != nullptr)
+            {
+                kargs.seqlen_k =
+                    kargs.cu_seqlen_kv_ptr[i_batch + 1] - kargs.cu_seqlen_kv_ptr[i_batch];
+            }
         }
 
         // for simplicity, batch stride we just modify the pointer
@@ -513,7 +598,7 @@ struct FmhaFwdV3Kernel
                              make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
                              {i_m0, i_n1});
 
-        EpiloguePipeline{}(o_dram_window, o_acc_tile);
+        EpiloguePipeline{}(o_dram_window, o_acc_tile, nullptr);
     }
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
index 5e63fb714a..ea024a0257 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
@@ -49,8 +49,8 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
+    static constexpr index_t kPadHeadDimQ  = Problem::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV  = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
     static constexpr bool kHasBiasGrad     = Problem::kHasBiasGrad;
     static constexpr bool kIsDeterministic = Problem::kIsDeterministic;
@@ -60,18 +60,18 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
     // last dimension vector length used to create tensor view(and decide buffer_load vector length)
     // ... together with tensor distribution. tensor dist should able to overwrite this
     static constexpr index_t kAlignmentQ =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentQ<Problem>();
     static constexpr index_t kAlignmentK =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentV<Problem>();
     static constexpr index_t kAlignmentOGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentOGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentOGrad<Problem>();
     static constexpr index_t kAlignmentQGrad = 1;
     static constexpr index_t kAlignmentKGrad =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentVGrad<Problem>();
     static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "kr_ktr_vr";
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
index b883aad155..6393f227a2 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
@@ -49,8 +49,8 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
+    static constexpr index_t kPadHeadDimQ  = Problem::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV  = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
     static constexpr bool kHasBiasGrad     = Problem::kHasBiasGrad;
     static constexpr bool kIsDeterministic = Problem::kIsDeterministic;
@@ -60,18 +60,18 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
     // last dimension vector length used to create tensor view(and decide buffer_load vector length)
     // ... together with tensor distribution. tensor dist should able to overwrite this
     static constexpr index_t kAlignmentQ =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentQ<Problem>();
     static constexpr index_t kAlignmentK =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentV<Problem>();
     static constexpr index_t kAlignmentOGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentOGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentOGrad<Problem>();
     static constexpr index_t kAlignmentQGrad = 1;
     static constexpr index_t kAlignmentKGrad =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentVGrad<Problem>();
     static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "kr_ktr_vr_iglp";
@@ -559,6 +559,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
                 auto shuffled_bias_tile = make_static_distributed_tensor<BiasDataType>(
                     Policy::template MakeShuffledBiasTileDistribution<Problem>());
                 shuffle_tile(shuffled_bias_tile, bias_tile);
+                // SGrad and Bias use the same address in LDS, finish loading ds on the previous
+                // iteration to reuse LDS.
+                block_sync_lds();
                 store_tile(bias_lds_write_window, shuffled_bias_tile);
                 block_sync_lds();
                 auto bias_s_tile = load_tile(bias_s_lds_read_window);
@@ -814,6 +817,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
             auto shuffled_bias_tile = make_static_distributed_tensor<BiasDataType>(
                 Policy::template MakeShuffledBiasTileDistribution<Problem>());
             shuffle_tile(shuffled_bias_tile, bias_tile);
+            // SGrad and Bias use the same address in LDS, finish loading ds in the hot loop to
+            // reuse LDS.
+            block_sync_lds();
             store_tile(bias_lds_write_window, shuffled_bias_tile);
             block_sync_lds();
             auto bias_s_tile = load_tile(bias_s_lds_read_window);
@@ -956,6 +962,8 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
                     return cast_tile<BiasGradDataType>(ds);
                 }
             }();
+            // Finish loading bias_s to reuse LDS.
+            block_sync_lds();
             store_tile(bias_lds_write_window, dbias);
             block_sync_lds();
             auto shuffled_dbias_tile = load_tile(dbias_lds_read_window);
@@ -975,11 +983,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
 
         gemm_3(dk_acc, dst_reg_tensor, qt_reg_tensor);
 
-        if constexpr(kHasBiasGrad)
-        {
-            // SGrad and BiasGrad use the same address in LDS.
-            block_sync_lds();
-        }
+        // SGrad and Bias/BiasGrad use the same address in LDS, finish loading bias/dbias or, when
+        // bias is not used, loading ds in the hot loop to reuse LDS.
+        block_sync_lds();
         store_tile(ds_lds_window, ds_gemm);
 
         block_sync_lds();
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp
index c3e84df934..abe024ced1 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp
@@ -14,7 +14,8 @@ namespace ck_tile {
 template <typename Problem, typename Policy>
 class BlockFmhaBwdDQDKDVPipelineSelector
 {
-    static constexpr bool has_dpad = Problem::Traits::kPadHeadDimQ || Problem::Traits::kPadHeadDimV;
+    static constexpr bool has_dpad1 =
+        Problem::Traits::kPadHeadDimQ == 1 || Problem::Traits::kPadHeadDimV == 1;
     static constexpr bool is_decode = Problem::BlockFmhaShape::kMaxSeqLenQ > 0;
 
     public:
@@ -24,7 +25,7 @@ class BlockFmhaBwdDQDKDVPipelineSelector
                            std::conditional_t<is_decode,
                                               BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR<TS...>,
                                               BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR<TS...>>,
-                           std::conditional_t<has_dpad,
+                           std::conditional_t<has_dpad1,
                                               BlockFmhaBwdDQDKDVPipelineKRKTRVR<TS...>,
                                               BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP<TS...>>>;
     using type = std::conditional_t<std::is_same_v<Policy, void>, //
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
index 9bd78b4077..5cdb4fe1d7 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
@@ -49,8 +49,8 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
+    static constexpr index_t kPadHeadDimQ  = Problem::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV  = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
     static constexpr bool kHasBiasGrad     = Problem::kHasBiasGrad;
     static constexpr bool kIsDeterministic = Problem::kIsDeterministic;
@@ -60,18 +60,18 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
     // last dimension vector length used to create tensor view(and decide buffer_load vector length)
     // ... together with tensor distribution. tensor dist should able to overwrite this
     static constexpr index_t kAlignmentQ =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentQ<Problem>();
     static constexpr index_t kAlignmentK =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentV<Problem>();
     static constexpr index_t kAlignmentOGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentOGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentOGrad<Problem>();
     static constexpr index_t kAlignmentQGrad = 1;
     static constexpr index_t kAlignmentKGrad =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentVGrad<Problem>();
     static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "trload_kr_ktr_vr";
@@ -103,27 +103,41 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
         const auto do_lds_ptr0 = reinterpret_cast<OGradDataType*>(smem_ptr_);
         const auto do_lds_ptr1 = reinterpret_cast<OGradDataType*>(
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>());
-        const auto q_lds_ptr0  = reinterpret_cast<QDataType*>( //
+        const auto q_lds_ptr0   = reinterpret_cast<QDataType*>( //
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeOGrad<Problem>());
-        const auto q_lds_ptr1  = reinterpret_cast<QDataType*>( //
+        const auto q_lds_ptr1   = reinterpret_cast<QDataType*>( //
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeQ<Problem>());
-        const auto lse_lds_ptr = reinterpret_cast<LSEDataType*>(
+        const auto lse_lds_ptr0 = reinterpret_cast<LSEDataType*>(
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeQ<Problem>() + Policy::template GetSmemSizeQ<Problem>());
-        const auto d_lds_ptr = reinterpret_cast<DDataType*>(
+        const auto lse_lds_ptr1 = reinterpret_cast<LSEDataType*>(
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeQ<Problem>() + Policy::template GetSmemSizeQ<Problem>() +
             Policy::template GetSmemSizeLSE<Problem>());
+        const auto d_lds_ptr0 = reinterpret_cast<DDataType*>(
+            smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
+            Policy::template GetSmemSizeOGrad<Problem>() +
+            Policy::template GetSmemSizeQ<Problem>() + Policy::template GetSmemSizeQ<Problem>() +
+            Policy::template GetSmemSizeLSE<Problem>() +
+            Policy::template GetSmemSizeLSE<Problem>());
+        const auto d_lds_ptr1 = reinterpret_cast<DDataType*>(
+            smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
+            Policy::template GetSmemSizeOGrad<Problem>() +
+            Policy::template GetSmemSizeQ<Problem>() + Policy::template GetSmemSizeQ<Problem>() +
+            Policy::template GetSmemSizeLSE<Problem>() +
+            Policy::template GetSmemSizeLSE<Problem>() + Policy::template GetSmemSizeD<Problem>());
         const auto ds_lds_ptr = reinterpret_cast<GemmDataType*>(
             smem_ptr_ + Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeOGrad<Problem>() +
             Policy::template GetSmemSizeQ<Problem>() + Policy::template GetSmemSizeQ<Problem>() +
-            Policy::template GetSmemSizeLSE<Problem>() + Policy::template GetSmemSizeD<Problem>());
+            Policy::template GetSmemSizeLSE<Problem>() +
+            Policy::template GetSmemSizeLSE<Problem>() + Policy::template GetSmemSizeD<Problem>() +
+            Policy::template GetSmemSizeD<Problem>());
         const auto bias_lds_ptr = reinterpret_cast<BiasDataType*>(ds_lds_ptr);
         return run(k_lds_ptr,
                    v_lds_ptr,
@@ -131,8 +145,10 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
                    do_lds_ptr1,
                    q_lds_ptr0,
                    q_lds_ptr1,
-                   lse_lds_ptr,
-                   d_lds_ptr,
+                   lse_lds_ptr0,
+                   lse_lds_ptr1,
+                   d_lds_ptr0,
+                   d_lds_ptr1,
                    ds_lds_ptr,
                    bias_lds_ptr,
                    std::forward<Ts>(args)...);
@@ -156,8 +172,10 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
         OGradDataType* __restrict__ do_lds_ptr1,
         QDataType* __restrict__ q_lds_ptr0,
         QDataType* __restrict__ q_lds_ptr1,
-        LSEDataType* __restrict__ lse_lds_ptr,
-        DDataType* __restrict__ d_lds_ptr,
+        LSEDataType* __restrict__ lse_lds_ptr0,
+        LSEDataType* __restrict__ lse_lds_ptr1,
+        DDataType* __restrict__ d_lds_ptr0,
+        DDataType* __restrict__ d_lds_ptr1,
         GemmDataType* __restrict__ ds_lds_ptr,
         BiasDataType* __restrict__ bias_lds_ptr,
         const QDramBlockWindowTmp& q_dram_block_window_tmp,
@@ -389,38 +407,38 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
                       "BiasDataType and BiasGradDataType should be the same!");
 
         // LSE: HBM -> LDS ->Reg
-        auto lse_dram_window = make_tile_window(
-            lse_dram_block_window_tmp.get_bottom_tensor_view(),
-            lse_dram_block_window_tmp.get_window_lengths(),
-            {seqlen_q_start},
-            Policy::template MakeLSEDDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto lse_dram_window =
+            make_tile_window(lse_dram_block_window_tmp.get_bottom_tensor_view(),
+                             lse_dram_block_window_tmp.get_window_lengths(),
+                             {seqlen_q_start},
+                             Policy::template MakeLSEDDramTileDistribution<Problem>());
 
         auto lse_lds = make_tensor_view<address_space_enum::lds>(
-            lse_lds_ptr, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
+            lse_lds_ptr0, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
 
         auto lse_lds_write_window = make_tile_window(lse_lds, make_tuple(number<kM0>{}), {0});
 
-        auto lse_lds_read_window = make_tile_window(
-            lse_lds,
-            make_tuple(number<kM0>{}),
-            {0},
-            Policy::template MakeLSEDLdsReadBlockDescriptor<Problem, decltype(gemm_0)>());
+        auto lse_lds_read_window =
+            make_tile_window(lse_lds,
+                             make_tuple(number<kM0>{}),
+                             {0},
+                             Policy::template MakeLSEDLdsReadBlockDescriptor<Problem>());
 
         // D: HBM ->Reg
-        auto d_dram_window = make_tile_window(
-            d_dram_block_window_tmp.get_bottom_tensor_view(),
-            d_dram_block_window_tmp.get_window_lengths(),
-            {seqlen_q_start},
-            Policy::template MakeLSEDDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto d_dram_window =
+            make_tile_window(d_dram_block_window_tmp.get_bottom_tensor_view(),
+                             d_dram_block_window_tmp.get_window_lengths(),
+                             {seqlen_q_start},
+                             Policy::template MakeLSEDDramTileDistribution<Problem>());
 
         auto d_lds = make_tensor_view<address_space_enum::lds>(
-            d_lds_ptr, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
+            d_lds_ptr0, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
         auto d_lds_write_window = make_tile_window(d_lds, make_tuple(number<kM0>{}), {0});
-        auto d_lds_read_window  = make_tile_window(
-            d_lds,
-            make_tuple(number<kM0>{}),
-            {0},
-            Policy::template MakeLSEDLdsReadBlockDescriptor<Problem, decltype(gemm_0)>());
+        auto d_lds_read_window =
+            make_tile_window(d_lds,
+                             make_tuple(number<kM0>{}),
+                             {0},
+                             Policy::template MakeLSEDLdsReadBlockDescriptor<Problem>());
 
         // RandVal: HBM ->Reg
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0), false>(
@@ -471,27 +489,31 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
         decltype(gemm_2.MakeCBlockTile()) dp_acc, ds;
         decltype(gemm_4.MakeCBlockTile()) dq_acc;
 
-        decltype(load_tile(lse_dram_window)) lse_block_tile;
-        decltype(load_tile(d_dram_window)) d_block_tile;
-
         index_t i_total_bodys = 0;
         auto main_body_impl   = [&](auto is_prologue_,
                                   auto is_epilogue_,
                                   QDataType* const __restrict__ q_lds_ptr_curr,
                                   QDataType* const __restrict__ q_lds_ptr_next,
                                   OGradDataType* const __restrict__ do_lds_ptr_curr,
-                                  OGradDataType* const __restrict__ do_lds_ptr_next) mutable {
+                                  OGradDataType* const __restrict__ do_lds_ptr_next,
+                                  LSEDataType* const __restrict__ lse_lds_ptr_curr,
+                                  LSEDataType* const __restrict__ lse_lds_ptr_next,
+                                  DDataType* const __restrict__ d_lds_ptr_curr,
+                                  DDataType* const __restrict__ d_lds_ptr_next
+
+                                  ) mutable {
             constexpr bool is_prologue = is_prologue_.value;
             constexpr bool is_epilogue = is_epilogue_.value;
             static_assert(is_prologue || is_epilogue, "is_prologue or is_epilogue should be true");
             constexpr bool is_main_body = is_prologue && is_epilogue;
-
             if constexpr(is_prologue)
             {
-                lse_block_tile = load_tile(lse_dram_window);
+                lse_lds_write_window.set_bottom_tensor_view_data_ptr(lse_lds_ptr_next);
+                async_load_tile(lse_lds_write_window, lse_dram_window);
                 move_tile_window(lse_dram_window, {kM0});
 
-                d_block_tile = load_tile(d_dram_window);
+                d_lds_write_window.set_bottom_tensor_view_data_ptr(d_lds_ptr_next);
+                async_load_tile(d_lds_write_window, d_dram_window);
                 move_tile_window(d_dram_window, {kM0});
 
                 q_lds_write_window.set_bottom_tensor_view_data_ptr(q_lds_ptr_next);
@@ -510,6 +532,13 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
                 dot_lds_read_window.set_bottom_tensor_view_data_ptr(do_lds_ptr_curr);
                 dot_reg_tensor = load_tile_transpose(dot_lds_read_window);
             }
+            if constexpr(is_epilogue)
+            {
+                lse_lds_read_window.set_bottom_tensor_view_data_ptr(lse_lds_ptr_curr);
+                lse = load_tile(lse_lds_read_window);
+                d_lds_read_window.set_bottom_tensor_view_data_ptr(d_lds_ptr_curr);
+                d = load_tile(d_lds_read_window);
+            }
             if constexpr(is_main_body)
                 Policy::template HotLoopScheduler<Problem>::SchedulerGemm0();
             __builtin_amdgcn_sched_barrier(0);
@@ -617,11 +646,6 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
             if constexpr(is_main_body)
                 Policy::template HotLoopScheduler<Problem>::SchedulerGemm12();
             __builtin_amdgcn_sched_barrier(0);
-            if constexpr(is_prologue)
-            {
-                store_tile(lse_lds_write_window, lse_block_tile);
-                store_tile(d_lds_write_window, d_block_tile);
-            }
             if constexpr(is_epilogue)
             {
                 // STAGE 5, P^T(PGrad^T - D)
@@ -674,15 +698,20 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
                 dst_reg_tensor.get_thread_buffer() = ds_gemm.get_thread_buffer();
                 gemm_3(dk_acc, dst_reg_tensor, qt_reg_tensor);
 
+                if constexpr(kHasBiasGrad)
+                {
+                    // SGrad and BiasGrad use the same address in LDS, finish loading dbias to reuse
+                    // LDS.
+                    block_sync_lds();
+                }
                 store_tile(ds_lds_window, ds_gemm);
             }
-            __builtin_amdgcn_s_waitcnt(3952);
+            s_waitcnt</*vmcnt=*/0>();
             block_sync_lds();
             if constexpr(is_prologue)
             {
                 q_lds_read_window.set_bottom_tensor_view_data_ptr(q_lds_ptr_next);
                 q_reg_tensor = load_tile(q_lds_read_window);
-                lse          = load_tile(lse_lds_read_window);
             }
             if constexpr(is_epilogue)
             {
@@ -720,7 +749,6 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
             {
                 do_lds_read_window.set_bottom_tensor_view_data_ptr(do_lds_ptr_next);
                 do_reg_tensor = load_tile(do_lds_read_window);
-                d             = load_tile(d_lds_read_window);
             }
             if constexpr(is_main_body)
                 Policy::template HotLoopScheduler<Problem>::SchedulerGemm4();
@@ -749,17 +777,25 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
         };
 
         auto main_body = [&](auto is_prologue_, auto is_epilogue_) mutable {
-            const bool is_even         = (i_total_bodys % 2 == 0);
-            const auto q_lds_ptr_curr  = is_even ? q_lds_ptr1 : q_lds_ptr0;
-            const auto q_lds_ptr_next  = is_even ? q_lds_ptr0 : q_lds_ptr1;
-            const auto do_lds_ptr_curr = is_even ? do_lds_ptr1 : do_lds_ptr0;
-            const auto do_lds_ptr_next = is_even ? do_lds_ptr0 : do_lds_ptr1;
+            const bool is_even          = (i_total_bodys % 2 == 0);
+            const auto q_lds_ptr_curr   = is_even ? q_lds_ptr1 : q_lds_ptr0;
+            const auto q_lds_ptr_next   = is_even ? q_lds_ptr0 : q_lds_ptr1;
+            const auto do_lds_ptr_curr  = is_even ? do_lds_ptr1 : do_lds_ptr0;
+            const auto do_lds_ptr_next  = is_even ? do_lds_ptr0 : do_lds_ptr1;
+            const auto lse_lds_ptr_curr = is_even ? lse_lds_ptr1 : lse_lds_ptr0;
+            const auto lse_lds_ptr_next = is_even ? lse_lds_ptr0 : lse_lds_ptr1;
+            const auto d_lds_ptr_curr   = is_even ? d_lds_ptr1 : d_lds_ptr0;
+            const auto d_lds_ptr_next   = is_even ? d_lds_ptr0 : d_lds_ptr1;
             main_body_impl(is_prologue_,
                            is_epilogue_,
                            q_lds_ptr_curr,
                            q_lds_ptr_next,
                            do_lds_ptr_curr,
-                           do_lds_ptr_next);
+                           do_lds_ptr_next,
+                           lse_lds_ptr_curr,
+                           lse_lds_ptr_next,
+                           d_lds_ptr_curr,
+                           d_lds_ptr_next);
             i_total_bodys += 1;
         };
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
index 5adb64564d..3d5bfcc76a 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
@@ -51,8 +51,8 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
+    static constexpr index_t kPadHeadDimQ  = Problem::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV  = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
     static constexpr bool kHasBiasGrad     = Problem::kHasBiasGrad;
     static constexpr bool kIsDeterministic = Problem::kIsDeterministic;
@@ -62,18 +62,18 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
     // last dimension vector length used to create tensor view(and decide buffer_load vector length)
     // ... together with tensor distribution. tensor dist should able to overwrite this
     static constexpr index_t kAlignmentQ =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentQ<Problem>();
     static constexpr index_t kAlignmentK =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentV<Problem>();
     static constexpr index_t kAlignmentOGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentOGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentOGrad<Problem>();
     static constexpr index_t kAlignmentQGrad = 1;
     static constexpr index_t kAlignmentKGrad =
-        kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
+        kPadHeadDimQ ? kPadHeadDimQ : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
-        kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
+        kPadHeadDimV ? kPadHeadDimV : Policy::template GetAlignmentVGrad<Problem>();
     static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "trload_kr_ktr_vr";
@@ -363,38 +363,38 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                       "BiasDataType and BiasGradDataType should be the same!");
 
         // LSE: HBM -> LDS ->Reg
-        auto lse_dram_window = make_tile_window(
-            lse_dram_block_window_tmp.get_bottom_tensor_view(),
-            lse_dram_block_window_tmp.get_window_lengths(),
-            {0},
-            Policy::template MakeLSEDDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto lse_dram_window =
+            make_tile_window(lse_dram_block_window_tmp.get_bottom_tensor_view(),
+                             lse_dram_block_window_tmp.get_window_lengths(),
+                             {0},
+                             Policy::template MakeLSEDDramTileDistribution<Problem>());
 
         auto lse_lds = make_tensor_view<address_space_enum::lds>(
             lse_lds_ptr, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
 
         auto lse_lds_write_window = make_tile_window(lse_lds, make_tuple(number<kM0>{}), {0});
 
-        auto lse_lds_read_window = make_tile_window(
-            lse_lds,
-            make_tuple(number<kM0>{}),
-            {0},
-            Policy::template MakeLSEDLdsReadBlockDescriptor<Problem, decltype(gemm_0)>());
+        auto lse_lds_read_window =
+            make_tile_window(lse_lds,
+                             make_tuple(number<kM0>{}),
+                             {0},
+                             Policy::template MakeLSEDLdsReadBlockDescriptor<Problem>());
 
         // D: HBM ->Reg
-        auto d_dram_window = make_tile_window(
-            d_dram_block_window_tmp.get_bottom_tensor_view(),
-            d_dram_block_window_tmp.get_window_lengths(),
-            {0},
-            Policy::template MakeLSEDDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto d_dram_window =
+            make_tile_window(d_dram_block_window_tmp.get_bottom_tensor_view(),
+                             d_dram_block_window_tmp.get_window_lengths(),
+                             {0},
+                             Policy::template MakeLSEDDramTileDistribution<Problem>());
 
         auto d_lds = make_tensor_view<address_space_enum::lds>(
             d_lds_ptr, Policy::template MakeLSEDLdsWriteBlockDescriptor<Problem>());
         auto d_lds_write_window = make_tile_window(d_lds, make_tuple(number<kM0>{}), {0});
-        auto d_lds_read_window  = make_tile_window(
-            d_lds,
-            make_tuple(number<kM0>{}),
-            {0},
-            Policy::template MakeLSEDLdsReadBlockDescriptor<Problem, decltype(gemm_0)>());
+        auto d_lds_read_window =
+            make_tile_window(d_lds,
+                             make_tuple(number<kM0>{}),
+                             {0},
+                             Policy::template MakeLSEDLdsReadBlockDescriptor<Problem>());
 
         // RandVal: HBM ->Reg
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0), true>(
@@ -489,7 +489,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                 move_tile_window(k_dram_window, {kN0, 0});
                 async_load_tile(v_lds_write_window, v_dram_window);
                 move_tile_window(v_dram_window, {kN0, 0});
-                // __builtin_amdgcn_s_waitcnt(0);
+                s_waitcnt</*vmcnt=*/0>();
                 k_reg_tensor  = load_tile(k_lds_read_window);
                 v_reg_tensor  = load_tile(v_lds_read_window);
                 kt_reg_tensor = load_tile_transpose(kt_lds_read_window);
@@ -636,7 +636,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                         }
                     }();
                     store_tile(bias_lds_write_window, dbias);
-                    __builtin_amdgcn_s_waitcnt(3952);
+                    s_waitcnt</*vmcnt=*/0>();
                     block_sync_lds();
                     auto shuffled_dbias_tile = load_tile(dbias_lds_read_window);
                     auto dbias_tile          = make_static_distributed_tensor<BiasGradDataType>(
@@ -656,9 +656,15 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                 dst_reg_tensor.get_thread_buffer() = ds_gemm.get_thread_buffer();
                 dk_acc                             = gemm_3(dst_reg_tensor, qt_reg_tensor);
 
+                if constexpr(kHasBiasGrad)
+                {
+                    // SGrad and BiasGrad use the same address in LDS, finish loading dbias to reuse
+                    // LDS.
+                    block_sync_lds();
+                }
                 store_tile(ds_lds_window, ds_gemm);
             }
-            __builtin_amdgcn_s_waitcnt(3952);
+            s_waitcnt</*vmcnt=*/0>();
             block_sync_lds();
             if constexpr(is_epilogue)
             {
@@ -707,18 +713,18 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                     tile_elementwise_inout([&raw_scale](auto& x) { x = x * raw_scale; }, dk_acc);
                 }
 
-                dk_epilogue(dk_dram_window, dk_acc);
+                dk_epilogue(dk_dram_window, dk_acc, nullptr);
                 move_tile_window(dk_dram_window, {kN0, 0});
-                dv_epilogue(dv_dram_window, dv_acc);
+                dv_epilogue(dv_dram_window, dv_acc, nullptr);
                 move_tile_window(dv_dram_window, {kN0, 0});
             }
         };
 
         for(index_t i = 0; i < seqlen_kv_start; i += kN0)
         {
-            dk_epilogue(dk_dram_window, decltype(gemm_3.MakeCBlockTile()){0});
+            dk_epilogue(dk_dram_window, decltype(gemm_3.MakeCBlockTile()){0}, nullptr);
             move_tile_window(dk_dram_window, {kN0, 0});
-            dv_epilogue(dv_dram_window, decltype(gemm_1.MakeCBlockTile()){0});
+            dv_epilogue(dv_dram_window, decltype(gemm_1.MakeCBlockTile()){0}, nullptr);
             move_tile_window(dv_dram_window, {kN0, 0});
         }
 
@@ -740,9 +746,9 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
         const auto seqlen_kv_length = k_length.at(number<0>{});
         for(; seqlen_kv_step < seqlen_kv_length; seqlen_kv_step += kN0)
         {
-            dk_epilogue(dk_dram_window, decltype(gemm_3.MakeCBlockTile()){0});
+            dk_epilogue(dk_dram_window, decltype(gemm_3.MakeCBlockTile()){0}, nullptr);
             move_tile_window(dk_dram_window, {kN0, 0});
-            dv_epilogue(dv_dram_window, decltype(gemm_1.MakeCBlockTile()){0});
+            dv_epilogue(dv_dram_window, decltype(gemm_1.MakeCBlockTile()){0}, nullptr);
             move_tile_window(dv_dram_window, {kN0, 0});
         }
 
@@ -752,8 +758,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                                    dq_acc);
         else
             tile_elementwise_inout([&raw_scale](auto& x) { x = x * raw_scale; }, dq_acc);
-        // static_assert(kIsDeterministic);
-        dq_epilogue(dq_dram_window, dq_acc);
+        dq_epilogue(dq_dram_window, dq_acc, nullptr);
         return;
     }
 };
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
index 68ead7c765..5eac387a66 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
@@ -408,8 +408,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                        sequence<1, 2>,
                                        sequence<2, 1>>{});
 
-        if constexpr(container_reduce(dstr.get_lengths(), std::multiplies<index_t>{}, 1) ==
-                     kNPerBlock * kKPerBlock)
+        if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2
         {
             return dstr;
         }
@@ -457,8 +456,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                        tuple<sequence<1>, sequence<2, 0>>,
                                        sequence<1, 2>, // N0 K1
                                        sequence<0, 1>>{});
-        if constexpr(container_reduce(dstr.get_lengths(), std::multiplies<index_t>{}, 1) ==
-                     kNPerBlock * kKPerBlock)
+        if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2
         {
             return dstr;
         }
@@ -507,8 +505,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                        sequence<1, 2>,
                                        sequence<2, 1>>{});
 
-        if constexpr(container_reduce(dstr.get_lengths(), std::multiplies<index_t>{}, 1) ==
-                     kMPerBlock * kKPerBlock)
+        if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2
         {
             return dstr;
         }
@@ -558,8 +555,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                        sequence<1, 2>,
                                        sequence<2, 1>>{});
 
-        if constexpr(container_reduce(dstr.get_lengths(), std::multiplies<index_t>{}, 1) ==
-                     kMPerBlock * kKPerBlock)
+        if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2
         {
             return dstr;
         }
@@ -1941,7 +1937,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
 
         constexpr index_t smem_size_stage0_0 = smem_size_k + smem_size_kt;
         constexpr index_t smem_size_stage0_1 = smem_size_v;
-        constexpr index_t smem_size_stage1   = smem_size_qt + smem_size_q + +smem_size_dot +
+        constexpr index_t smem_size_stage1   = smem_size_qt + smem_size_q + smem_size_dot +
                                              smem_size_do + smem_size_lse + smem_size_d +
                                              max(smem_size_bias, smem_size_ds);
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
index 99718a187f..38aff07093 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
@@ -57,13 +57,11 @@ struct BlockFmhaBwdPipelineProblem
     static constexpr bool kUseTrLoad       = kUseTrLoad_;
 
     // attributes from traits
-    static constexpr bool kPadHeadDimQ   = Traits::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV   = Traits::kPadHeadDimV;
-    static constexpr auto BiasEnum       = Traits::BiasEnum;
-    static constexpr bool kHasBiasGrad   = Traits::kHasBiasGrad;
-    static constexpr index_t kBlockPerCu = Traits::kBlockPerCu;
-    static_assert(!Traits::kPadSeqLenQ, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ");
-    static_assert(!Traits::kPadSeqLenK, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ");
+    static constexpr index_t kPadHeadDimQ = Traits::kPadHeadDimQ;
+    static constexpr index_t kPadHeadDimV = Traits::kPadHeadDimV;
+    static constexpr auto BiasEnum        = Traits::BiasEnum;
+    static constexpr bool kHasBiasGrad    = Traits::kHasBiasGrad;
+    static constexpr index_t kBlockPerCu  = Traits::kBlockPerCu;
 };
 
 template <typename ODataType_,
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
index 6259e5b473..30c2c26416 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
@@ -194,13 +194,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetTransposedAlignmentOGrad()
     {
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kVHeaddim;
-
-        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
-
-        return total_pixels / GetAlignmentOGrad<Problem>();
+        return GetTransposedAlignmentX<typename Problem::OGradDataType>();
     }
 
     template <typename Problem>
@@ -358,11 +352,30 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                                          Problem::BlockFmhaShape::kVHeaddim>();
     }
 
-    template <typename Problem, typename BlockGemm>
+    template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEDDramTileDistribution()
     {
-        return BlockFmhaBwdPipelineDefaultPolicy::MakeLSEDDramTileDistribution<Problem,
-                                                                               BlockGemm>();
+        using BlockGemm         = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+        constexpr auto config   = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+
+        constexpr index_t N0 = MWarp * NWarp;
+
+        constexpr index_t M1 = kMPerBlock;
+        constexpr index_t M0 = get_warp_size() / M1;
+        static_assert(M1 <= get_warp_size() && get_warp_size() % M1 == 0,
+                      "M1 must be a factor of warp size");
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<N0, M0>,
+                                       tuple<sequence<M1, 1>>,
+                                       tuple<sequence<0>, sequence<0, 1>>,
+                                       tuple<sequence<0>, sequence<1, 0>>,
+                                       sequence<1>,
+                                       sequence<1>>{});
     }
 
     template <typename Problem>
@@ -793,9 +806,10 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
         return lsed_lds_block_desc;
     }
 
-    template <typename Problem, typename BlockGemm>
+    template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEDLdsReadBlockDescriptor()
     {
+        using BlockGemm         = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
         constexpr auto config   = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
         using WG                = remove_cvref_t<decltype(config.template at<0>())>;
         constexpr index_t MWarp = config.template at<1>();
@@ -984,15 +998,16 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeLSE()
     {
-        return sizeof(typename Problem::LSEDataType) *
-               MakeLSEDLdsWriteBlockDescriptor<Problem>().get_element_space_size();
+        return static_cast<index_t>(max( //
+            sizeof(int) * get_warp_size(),
+            sizeof(typename Problem::LSEDataType) *
+                MakeLSEDLdsWriteBlockDescriptor<Problem>().get_element_space_size()));
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeD()
     {
-        return sizeof(typename Problem::DDataType) *
-               MakeLSEDLdsWriteBlockDescriptor<Problem>().get_element_space_size();
+        return GetSmemSizeLSE<Problem>();
     }
 
     template <typename Problem>
@@ -1039,8 +1054,9 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
         constexpr index_t smem_size_bias = GetSmemSizeBias<Problem>();
 
         constexpr index_t smem_size_stage0 = smem_size_k + smem_size_v;
-        constexpr index_t smem_size_stage1 = smem_size_q * 2 + smem_size_do * 2 + smem_size_lse +
-                                             smem_size_d + max(smem_size_bias, smem_size_ds);
+        constexpr index_t smem_size_stage1 = smem_size_q * 2 + smem_size_do * 2 +
+                                             smem_size_lse * 2 + smem_size_d * 2 +
+                                             max(smem_size_bias, smem_size_ds);
         return max(smem_size_stage0, smem_size_stage1);
     }
 
@@ -1090,6 +1106,8 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
         static constexpr index_t LSE_VMEM_READ = 1;
         static constexpr index_t D_VMEM_READ   = 1;
 
+        static constexpr index_t DQ_VMEM_WRITE = kM0 * kQKHeaddim / kBlockSize; // atomic add
+
         // LDS Read
         static constexpr index_t OGradT_LDS_READ =
             kM0 * kVHeaddim / get_warp_size() / GetTransposedAlignmentOGrad<Problem>();
@@ -1116,11 +1134,12 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
             kM0 * kVHeaddim / kBlockSize / GetAlignmentOGrad<Problem>();
         static constexpr index_t OGradT_LDS_WRITE =
             kM0 * kVHeaddim / kBlockSize / GetTransposedAlignmentOGrad<Problem>();
-        static constexpr index_t LSE_LDS_WRITE    = 1;
-        static constexpr index_t D_LDS_WRITE      = 1;
         static constexpr index_t SGradT_LDS_WRITE = kM0 * kN0 / kBlockSize;
 
         public:
+        static constexpr index_t TOTAL_VMEM_READ =
+            Q_VMEM_READ + OGrad_VMEM_READ + LSE_VMEM_READ + D_VMEM_READ + DQ_VMEM_WRITE;
+
         CK_TILE_DEVICE static constexpr void SchedulerGemm0()
         {
             // Mem: Q, LSE, OGrad, D global load, OGrad^T LDS load
@@ -1128,7 +1147,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
             constexpr index_t VMEM_READ_INST =
                 Q_VMEM_READ + OGrad_VMEM_READ + LSE_VMEM_READ + D_VMEM_READ;
             constexpr index_t MFMA_INST     = Gemm0MFMA;
-            constexpr index_t LDS_READ_INST = OGradT_LDS_READ;
+            constexpr index_t LDS_READ_INST = OGradT_LDS_READ + LSE_LDS_READ + D_LDS_READ;
 
             constexpr index_t lcm_inst = lcm(VMEM_READ_INST, MFMA_INST, LDS_READ_INST);
             static_for<0, lcm_inst, 1>{}([&](auto i) {
@@ -1161,8 +1180,8 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
         {
             // Mem: LSE/D LDS store, SGradT LDS store, SGrad, Q, LSE LDS load.
             // Comp: SGradT x QT
-            constexpr index_t LDS_WRITE_INST = LSE_LDS_WRITE + D_LDS_WRITE + SGradT_LDS_WRITE;
-            constexpr index_t LDS_READ_INST  = SGradT_LDS_READ_P1 + Q_LDS_READ + LSE_LDS_READ;
+            constexpr index_t LDS_WRITE_INST = SGradT_LDS_WRITE;
+            constexpr index_t LDS_READ_INST  = SGradT_LDS_READ_P1 + Q_LDS_READ;
             constexpr index_t MFMA_INST      = Gemm3MFMA;
 
             constexpr index_t lds_rw_inst = LDS_WRITE_INST + LDS_READ_INST;
@@ -1185,7 +1204,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
         {
             // Mem: SGrad, OGrad, D LDS load.
             // Comp: SGrad x KT
-            constexpr index_t LDS_READ_INST = SGradT_LDS_READ_P2 + OGrad_LDS_READ + D_LDS_READ;
+            constexpr index_t LDS_READ_INST = SGradT_LDS_READ_P2 + OGrad_LDS_READ;
             constexpr index_t MFMA_INST     = Gemm4MFMA;
 
             constexpr index_t lcm_inst = lcm(MFMA_INST, LDS_READ_INST);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
index 9d267e1cee..b01c127a21 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
@@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
                 k_block_tile = load_tile(k_dram_window);
             }
             auto physical_next_block_id_k =
-                __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
+                amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
                     i_page_block_k, k_dram_block_window, {kN0, 0}));
-            auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
+            auto physical_next_block_id_v = amd_wave_read_first_lane(
                 v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
 
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs_default_policy.hpp
index 9c348495ff..f7ee88f906 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs_default_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -32,12 +32,27 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVSDefaultPolicy
 
         constexpr auto warp_gemm = []() {
             constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
-            static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
 
-            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
-                         std::is_same_v<typename Problem::KDataType, half_t> &&
+            if constexpr(std::is_same_v<typename Problem::QDataType, float> &&
+                         std::is_same_v<typename Problem::KDataType, float> &&
                          std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 16);
+
+                return WarpGemmDispatcher<typename Problem::QDataType,
+                                          typename Problem::KDataType,
+                                          typename Problem::SaccDataType,
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<1>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<2>{}),
+                                          true>{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                              std::is_same_v<typename Problem::KDataType, half_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -49,6 +64,8 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVSDefaultPolicy
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
index 7ac86e6d12..7b30f36fd8 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -223,6 +223,8 @@ struct BlockFmhaFwdSplitKVCombinePipeline
             });
         }
 
+        // sync before rewriting lse_acc_lds
+        block_sync_lds();
         // store the lse scales in shared memory.
         {
             constexpr auto spans = decltype(lse_accum)::get_distributed_spans();
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 9de640b7cf..fe5e0bc345 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                 k_block_tile = load_tile(k_dram_window);
             }
             auto physical_next_block_id_k =
-                __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
+                amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
                     i_page_block_k, k_dram_block_window, {kN0, 0}));
-            auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
+            auto physical_next_block_id_v = amd_wave_read_first_lane(
                 v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
 
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
@@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                                                   &i_page_block_v_ = i_page_block_v,
                                                   &v_dram_window_  = v_dram_window](auto i_k1) {
                     auto physical_next_block_id_v_ =
-                        __builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id(
+                        amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
                             i_page_block_v_, v_dram_window_, {0, kK1}));
                     const auto v = load_tile(v_dram_window_); // load next v
                     block_sync_lds();
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
index 20d84116d4..5e2a4e898b 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
@@ -57,7 +57,11 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/true>
                     __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 1) {}
+            else if constexpr(Phase == 1)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 2)
             {
 #if !CK_TILE_DISABLE_PACKED_FP32
@@ -68,11 +72,19 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/true>
                     __builtin_amdgcn_sched_group_barrier(0x002, 4, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 3) {}
+            else if constexpr(Phase == 3)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
         }
         else
         {
-            if constexpr(Phase == 0) {}
+            if constexpr(Phase == 0)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 1)
             {
                 static_for<0, 8, 1>{}([&](auto) {
@@ -81,7 +93,11 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/true>
                     __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 2) {}
+            else if constexpr(Phase == 2)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 3)
             {
 #if !CK_TILE_DISABLE_PACKED_FP32
@@ -115,7 +131,11 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/false>
                     __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 1) {}
+            else if constexpr(Phase == 1)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 2)
             {
 #if !CK_TILE_DISABLE_PACKED_FP32
@@ -126,11 +146,19 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/false>
                     __builtin_amdgcn_sched_group_barrier(0x002, 4, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 3) {}
+            else if constexpr(Phase == 3)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
         }
         else
         {
-            if constexpr(Phase == 0) {}
+            if constexpr(Phase == 0)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 1)
             {
                 static_for<0, 8, 1>{}([&](auto) {
@@ -139,7 +167,11 @@ struct CoreLoopScheduler<PipelineProblem, /*kIsMasking=*/false>
                     __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
                 });
             }
-            else if constexpr(Phase == 2) {}
+            else if constexpr(Phase == 2)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x002, 2, 0); // VALU
+                __builtin_amdgcn_sched_group_barrier(0x004, 4, 0); // SALU
+            }
             else if constexpr(Phase == 3)
             {
 #if !CK_TILE_DISABLE_PACKED_FP32
@@ -177,6 +209,15 @@ CK_TILE_DEVICE float add_impl_vv(float lhs, float rhs)
     return result;
 }
 
+CK_TILE_DEVICE float mul_impl_vv(float lhs, float rhs)
+{
+    float result;
+    asm volatile("v_mul_f32_e32 %[result], %[lhs], %[rhs]"
+                 : [result] "=v"(result)
+                 : [lhs] "v"(lhs), [rhs] "v"(rhs));
+    return result;
+}
+
 CK_TILE_DEVICE fp16x2_t cvt_pk_fp16_f32(float a, float b)
 {
     fp16x2_t result;
@@ -466,7 +507,7 @@ struct BlockFmhaFwdV3Pipeline
         statically_indexed_array<sp_compute_type, 2> sp;
 
         decltype(gemm_1.MakeCBlockTile()) o_acc;
-        constexpr index_t fmha_alu_D_reg_cnt = 0; // threshold to decide how many fmha_alu_D_upd()
+        constexpr index_t fmha_alu_D_reg_cnt = 6; // threshold to decide how many fmha_alu_D_upd()
                                                   // instructions should we move to fmha_alu1()
         static_assert(fmha_alu_D_reg_cnt <= o_acc.thread_buf_.size());
 
@@ -631,8 +672,8 @@ struct BlockFmhaFwdV3Pipeline
 
         // K_mem_su_ld_insts = 1 for 32 x 128
         // V_mem_su_ld_insts = 1 for 128 x 32
-        static constexpr int K_mem_su_ld_insts = 1;
-        static constexpr int V_mem_su_ld_insts = 1;
+        constexpr int K_mem_su_ld_insts = k_dram_window.get_num_of_access();
+        constexpr int V_mem_su_ld_insts = v_dram_window.get_num_of_access();
 
         auto K_mem_load = [&](auto k_lds_write_idx) {
             async_load_tile_raw(k_lds_window_store(k_lds_write_idx), k_dram_window);
@@ -648,7 +689,6 @@ struct BlockFmhaFwdV3Pipeline
 
         auto V_mem_load = [&](auto v_lds_write_idx) {
             async_load_tile_raw(v_lds_window_store(v_lds_write_idx), v_dram_window);
-            __builtin_amdgcn_sched_barrier(0);
 
             /// FIXME: use the future-predicting method to move the window
             move_tile_window(v_dram_window, {kK1, 0});
@@ -726,11 +766,12 @@ struct BlockFmhaFwdV3Pipeline
 #else
             block_tile_reduce_sync(rowsum_p, f_sum, bool_constant<false>{});
 #endif
-            // update partial o_acc [0, 2)
-            static_for<0, ck_tile::min(2, fmha_alu_D_reg_cnt), 1>{}(
-                [&](auto idx) { o_acc.thread_buf_[idx] *= o_acc_scale; });
 
             // l{j}
+            /// Note: The compiler keeps moving the following instructions elsewhere because 'l'
+            /// is first consumed later. To anchor them here, we rewrite the final addition in
+            /// inline assembly to create a dependency, forcing the dependent instructions to
+            /// be emitted at this point.
             constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
             sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
                 constexpr auto i_idx = make_tuple(idx0);
@@ -739,13 +780,15 @@ struct BlockFmhaFwdV3Pipeline
                 l(i_idx) = detail::add_impl_vv(tmp * l[i_idx], rowsum_p[i_idx]);
             });
 
-            // update partial o_acc [2, fmha_alu_D_reg_cnt)
-            static_for<2, ck_tile::max(2, fmha_alu_D_reg_cnt), 1>{}(
-                [&](auto idx) { o_acc.thread_buf_[idx] *= o_acc_scale; });
+            // update partial o_acc [0, fmha_alu_D_reg_cnt)
+            static_for<0, fmha_alu_D_reg_cnt, 1>{}([&](auto idx) {
+                o_acc.thread_buf_[idx] = detail::mul_impl_vv(o_acc.thread_buf_[idx], o_acc_scale);
+            });
 
-            /// NOTICE: Compiler keep moving the conversion instructions to other places. We rewite
-            /// the cast_tile() call into inline asm to force the conversion instructions to be
-            /// generated here. The fmha_alu1() call should be placed at the end of a phase.
+            /// Note: The compiler keeps sinking the conversion instructions because the
+            /// result 'p' is only consumed later. To anchor them here, we rewrite
+            /// the cast_tile() call as inline assembly, forcing the conversions to be
+            /// emitted at this point.
             static_assert(sp(sp_reg_idx).p.thread_buf_.size() % 2 == 0);
             static_for<0, sp(sp_reg_idx).p.thread_buf_.size(), 2>{}([&](auto idx) {
                 float x = p_compute_element_func(sp(sp_reg_idx).sp_compute.thread_buf_[idx]);
@@ -763,6 +806,10 @@ struct BlockFmhaFwdV3Pipeline
                     sp(sp_reg_idx).p.thread_buf_[idx + 1] = casted.y;
                 }
             });
+
+            /// Note: Place fmha_alu1() at the end of the phase. The surrounding inline assembly
+            /// can interfere with the behavior of sched_group_barrier(), so ending the phase here
+            /// avoids unintended reordering.
         };
 
         auto gemm = [&](auto sp_reg_idx, auto gemm_idx) {
@@ -937,9 +984,9 @@ struct BlockFmhaFwdV3Pipeline
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
                     cl_load(memK, K_w0_lds_wr_idx, V_w0_lds_rd_idx);
+                    Scheduler::schedule(cl_p, number<1>{});
                     fmha_mask(xdl_SP_p01_reg_idx);
 
-                    Scheduler::schedule(cl_p, number<1>{});
                     __builtin_amdgcn_sched_barrier(0);
                     // phase2
                     ASM_MARKER("phase2 Wave0-3");
@@ -947,6 +994,8 @@ struct BlockFmhaFwdV3Pipeline
                     __builtin_amdgcn_sched_barrier(0);
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
+                    asm volatile("s_nop 0");
+                    __builtin_amdgcn_sched_barrier(0);
                     cl_calc(xdl_SP_p23_reg_idx, gemm1);
 
                     Scheduler::schedule(cl_p, number<2>{});
@@ -995,6 +1044,8 @@ struct BlockFmhaFwdV3Pipeline
                     __builtin_amdgcn_sched_barrier(0);
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
+                    asm volatile("s_nop 1");
+                    __builtin_amdgcn_sched_barrier(0);
                     cl_calc(xdl_SP_p01_reg_idx, gemm0);
                     fmha_alu1(xdl_SP_p23_reg_idx);
 
@@ -1005,9 +1056,9 @@ struct BlockFmhaFwdV3Pipeline
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
                     cl_load(memK, K_w4_lds_wr_idx, V_w4_lds_rd_idx);
+                    Scheduler::schedule(cl_p, number<2>{});
                     fmha_mask(xdl_SP_p01_reg_idx);
 
-                    Scheduler::schedule(cl_p, number<2>{});
                     kv_token_start += kN0;
                     if(num_total_loop <= ++i_total_loops)
                     {
@@ -1021,6 +1072,8 @@ struct BlockFmhaFwdV3Pipeline
                     __builtin_amdgcn_sched_barrier(0);
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
+                    asm volatile("s_nop 1");
+                    __builtin_amdgcn_sched_barrier(0);
                     cl_calc(xdl_SP_p23_reg_idx, gemm1);
 
                     Scheduler::schedule(cl_p, number<3>{});
@@ -1036,7 +1089,14 @@ struct BlockFmhaFwdV3Pipeline
             auto ps_pi        = number<1>{} - d;
             auto V_lds_rd_idx = ps_pi;
 
-            s_waitcnt_vmcnt<K_mem_su_ld_insts>();
+            if(1 < num_total_loop)
+            {
+                s_waitcnt_vmcnt<K_mem_su_ld_insts>();
+            }
+            else
+            {
+                s_waitcnt_vmcnt<0>();
+            }
             __builtin_amdgcn_s_barrier();
 
             V_lds_load(V_lds_rd_idx);
@@ -1102,14 +1162,14 @@ struct BlockFmhaFwdV3Pipeline
                 V_mem_load(number<1>{}); // V1
                 K_lds_load(number<1>{}); // K1
 
-                asm volatile("s_setprio 0");
+                __builtin_amdgcn_s_setprio(0);
                 __builtin_amdgcn_s_barrier();
                 while(core_loop(number<0>{}))
                     ;
             }
             if(warp_group_id != 0)
             {
-                asm volatile("s_setprio 1");
+                __builtin_amdgcn_s_setprio(1);
                 __builtin_amdgcn_s_barrier();
                 while(core_loop(number<1>{}))
                     ;
@@ -1167,14 +1227,13 @@ struct BlockFmhaFwdV3Pipeline
               typename KDramBlockWindowTmp,
               typename VDramBlockWindowTmp,
               typename LSEDramBlockWindowTmp>
-    CK_TILE_HOST_DEVICE auto
-    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile
-               const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile
-               const VDramBlockWindowTmp& v_dram_block_window_tmp, // N1*K1 tile
-               LSEDramBlockWindowTmp& lse_dram_block_window_tmp,   // M0*1 tile
-               FmhaMask mask,
-               float scale_s,
-               void* smem_ptr) const
+    CK_TILE_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile
+                                   const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile
+                                   const VDramBlockWindowTmp& v_dram_block_window_tmp, // N1*K1 tile
+                                   LSEDramBlockWindowTmp& lse_dram_block_window_tmp,   // M0*1 tile
+                                   FmhaMask mask,
+                                   float scale_s,
+                                   void* smem_ptr) const
     {
         using namespace ck_tile;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
index aafe481d2b..b2c1b06955 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
@@ -37,6 +37,7 @@ struct BlockFmhaPipelineQRKSVSAsyncTrload
     using VLayout                    = remove_cvref_t<typename BlockFmhaShape::VLayout>;
     static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once
     static_assert(kQLoadOnce == Policy::QLoadOnce);
+    static constexpr bool kKLoadOnce = BlockFmhaShape::kM0 >= 64;
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp
index 67ab548dab..050eb48384 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -264,12 +264,27 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy
 
         constexpr auto warp_gemm = []() {
             constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
-            static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
 
-            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
-                         std::is_same_v<typename Problem::KDataType, half_t> &&
+            if constexpr(std::is_same_v<typename Problem::QDataType, float> &&
+                         std::is_same_v<typename Problem::KDataType, float> &&
                          std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 16);
+
+                return WarpGemmDispatcher<typename Problem::QDataType,
+                                          typename Problem::KDataType,
+                                          typename Problem::SaccDataType,
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<1>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<2>{}),
+                                          true>{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                              std::is_same_v<typename Problem::KDataType, half_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -281,6 +296,8 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index ff1f31edc8..9dba3c85d5 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -73,12 +73,27 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
 
         constexpr auto warp_gemm = []() {
             constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
-            static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
 
-            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
-                         std::is_same_v<typename Problem::KDataType, half_t> &&
+            if constexpr(std::is_same_v<typename Problem::QDataType, float> &&
+                         std::is_same_v<typename Problem::KDataType, float> &&
                          std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 16);
+
+                return WarpGemmDispatcher<typename Problem::QDataType,
+                                          typename Problem::KDataType,
+                                          typename Problem::SaccDataType,
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<1>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<2>{}),
+                                          true>{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                              std::is_same_v<typename Problem::KDataType, half_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -90,6 +105,8 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -201,7 +218,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
         constexpr auto q_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kKPerBlock / kKPack>{}, number<kMPerBlock>{}, number<kKPack>{}),
             make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
-            number<8>{},
+            number<kKPack>{},
             number<1>{});
 
         constexpr auto q_lds_block_desc = transform_tensor_descriptor(
@@ -228,14 +245,29 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
                                            typename Problem::BlockFmhaShape::Gemm0BlockWarps,
                                            typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
 
-        constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
-        static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
-
         constexpr auto warp_gemm = []() {
-            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
-                         std::is_same_v<typename Problem::KDataType, half_t> &&
+            constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
+
+            if constexpr(std::is_same_v<typename Problem::QDataType, float> &&
+                         std::is_same_v<typename Problem::KDataType, float> &&
                          std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 16);
+
+                return WarpGemmDispatcher<typename Problem::QDataType,
+                                          typename Problem::KDataType,
+                                          typename Problem::SaccDataType,
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<1>{}),
+                                          Problem::BlockFmhaShape::Gemm0WarpTile::at(number<2>{}),
+                                          true>{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                              std::is_same_v<typename Problem::KDataType, half_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -247,6 +279,8 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
                 else if constexpr(WarpGemmM == 16)
@@ -258,6 +292,8 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
                               std::is_same_v<typename Problem::KDataType, fp8_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 32);
+
                 // TODO: hard coded here. Otherwise, it may incorrect result
                 constexpr index_t swizzle_factor = 4;
                 return WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution<
@@ -507,7 +543,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kKPerBlock / kKPack>{}, number<kNPerBlock>{}, number<kKPack>{}),
             make_tuple(number<(kNPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
-            number<8>{},
+            number<kKPack>{},
             number<1>{});
 
         constexpr auto k_lds_block_desc = transform_tensor_descriptor(
@@ -806,14 +842,14 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
             if constexpr(total_pixels % N1 != 0 || kKPack % K3 != 0) // if K2 or K3 is not divisible
             {
-                constexpr index_t kNPack = 32;
-                static_assert(kNPerBlock % kNPack == 0);
-                constexpr index_t K0   = kBlockSize / get_warp_size();
-                constexpr index_t N2   = 2;
-                constexpr index_t N1_m = kNPack / N2;
-                constexpr index_t N0_m = kNPerBlock / kNPack;
-                constexpr index_t K1   = get_warp_size() / N1_m;
-                constexpr index_t K2_m = kKPerBlock / K1;
+                static_assert(kNPerBlock % 16 == 0);
+                constexpr index_t kNPack = kNPerBlock % 32 == 0 ? 32 : 16;
+                constexpr index_t K0     = kBlockSize / get_warp_size();
+                constexpr index_t N2     = 2;
+                constexpr index_t N1_m   = kNPack / N2;
+                constexpr index_t N0_m   = kNPerBlock / kNPack;
+                constexpr index_t K1     = get_warp_size() / N1_m;
+                constexpr index_t K2_m   = kKPerBlock / K1 / K0;
                 return make_static_tile_distribution(
                     tile_distribution_encoding<
                         sequence<1>,
@@ -823,7 +859,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
                         sequence<1, 2, 1>, // N0 K2 N2
                         sequence<0, 2, 2>>{});
             }
-            else if constexpr(get_warp_size() % (kKPack / K3 * N0) == 0)
+            else if constexpr(get_warp_size() % (K2 * N0) == 0)
             {
                 constexpr index_t K1 = get_warp_size() / (K2 * N0);
                 constexpr index_t K0 = kBlockSize / get_warp_size();
@@ -862,13 +898,40 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             constexpr index_t N0 = kNPerBlock / (N2 * N1);
             static_assert(N0 != 0);
 
-            return make_static_tile_distribution(
+            constexpr auto dstr = make_static_tile_distribution(
                 tile_distribution_encoding<sequence<1>,
                                            tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                           tuple<sequence<1>, sequence<1, 2>>,
+                                           tuple<sequence<1>, sequence<1, 2>>, // N1, N2 K0
                                            tuple<sequence<1>, sequence<2, 0>>,
-                                           sequence<1, 2>,
+                                           sequence<1, 2>, // N0 K1
                                            sequence<0, 1>>{});
+            if constexpr(container_reduce(dstr.get_lengths(), std::multiplies<index_t>{}, 1) ==
+                         kNPerBlock * kKPerBlock)
+            {
+                return dstr;
+            }
+            else
+            {
+                static_assert(kKPerBlock % 16 == 0);
+                constexpr index_t kKPerIter = kKPerBlock % 32 == 0 ? 32 : 16;
+                constexpr index_t K0_m      = kKPerBlock / kKPerIter;
+                constexpr index_t K2        = 2;
+                constexpr index_t K1_m      = kKPerIter / K2;
+                constexpr index_t N2_m      = get_warp_size() / K1_m;
+                constexpr index_t N0_m      = kNPerBlock / (N2_m * N1);
+                constexpr auto dstr_m       = make_static_tile_distribution(
+                    tile_distribution_encoding<
+                              sequence<1>,
+                              tuple<sequence<N0_m, N1, N2_m>, sequence<K0_m, K1_m, K2>>,
+                              tuple<sequence<1>, sequence<1, 2>>, // N1, N2 K1
+                              tuple<sequence<1>, sequence<2, 1>>,
+                              sequence<2, 1, 2>, // K0 N0 K2
+                              sequence<0, 0, 2>>{});
+                static_assert(container_reduce(dstr_m.get_lengths(),
+                                               std::multiplies<index_t>{},
+                                               1) == kNPerBlock * kKPerBlock);
+                return dstr_m;
+            }
         }
     }
 
@@ -896,14 +959,14 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
         if constexpr(total_pixels % N1 != 0 || kKPack % K3 != 0) // if K2 or K3 is not divisible
         {
-            constexpr index_t kNPack = 32;
-            static_assert(kNPerBlock % kNPack == 0);
-            constexpr index_t K0   = kBlockSize / get_warp_size();
-            constexpr index_t N2   = 2;
-            constexpr index_t N1_m = kNPack / N2;
-            constexpr index_t N0_m = kNPerBlock / kNPack;
-            constexpr index_t K1   = get_warp_size() / N1_m;
-            constexpr index_t K2_m = kKPerBlock / K1;
+            static_assert(kNPerBlock % 16 == 0);
+            constexpr index_t kNPack = kNPerBlock % 32 == 0 ? 32 : 16;
+            constexpr index_t K0     = kBlockSize / get_warp_size();
+            constexpr index_t N2     = 2;
+            constexpr index_t N1_m   = kNPack / N2;
+            constexpr index_t N0_m   = kNPerBlock / kNPack;
+            constexpr index_t K1     = get_warp_size() / N1_m;
+            constexpr index_t K2_m   = kKPerBlock / K1 / K0;
             return make_static_tile_distribution(
                 tile_distribution_encoding<sequence<1>,
                                            tuple<sequence<N0_m, N1_m, N2>, sequence<K0, K1, K2_m>>,
@@ -912,7 +975,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
                                            sequence<1, 1, 2>, // N0 K2 <-> N2
                                            sequence<0, 2, 2>>{});
         }
-        else if constexpr(get_warp_size() % (kKPack / K3 * N0) == 0)
+        else if constexpr(get_warp_size() % (K2 * N0) == 0)
         {
             constexpr index_t K1 = get_warp_size() / (K2 * N0);
             constexpr index_t K0 = kBlockSize / get_warp_size();
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index 41a744ea91..ca82519e72 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -7,20 +7,22 @@
 
 namespace ck_tile {
 
-static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index_t len)
+template <index_t Headdim>
+static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length()
 {
-    if(len == 96)
+    if constexpr(Headdim == 48)
+        return 48;
+    else if constexpr(Headdim == 96)
         return 128;
-    if(len == 160)
+    else if constexpr(Headdim == 160)
         return 256;
-    if(len == 192)
+    else if constexpr(Headdim == 192)
         return 192;
-
-    // only length of 96, 160 and power-of-two is supported
-    if(!(len & (len - 1)))
-        return len;
-
-    return 0;
+    else if constexpr(is_power_of_two_integer(Headdim))
+        return Headdim;
+    else
+        static_assert(Headdim == 0,
+                      "only Headdim of 48, 96, 160, 192 and power-of-two is supported");
 };
 
 template <typename BlockTile_, // sequence<...
@@ -55,7 +57,7 @@ struct TileFmhaShape
                                     // once (or repeately load Q as a whole tile)
     static_assert(kQKHeaddim % kK0 == 0, "kQKHeaddim should be divisible by kK0");
 
-    static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length(kQKHeaddim);
+    static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length<kQKHeaddim>();
 
     // v, rowmajor : seqlen*hdim, colmajor : hdim*seqlen
     static constexpr bool IsVLayoutRowMajor = IsVLayoutRowMajor_;
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
index cd3893f5cf..59267fa3b1 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
@@ -37,6 +37,23 @@ struct TileFmhaTraits
     static constexpr bool kSkipMinSeqlenQ   = kSkipMinSeqlenQ_;
 };
 
+template <index_t kPadHeadDimQ_ /* paddding for hdim_q */,
+          index_t kPadHeadDimV_ /* paddding for hdim_v */,
+          BlockAttentionBiasEnum BiasEnum_,
+          bool kHasBiasGrad_,
+          index_t kBlockPerCu_ = -1 /* overwrite occupancy if not -1 */>
+struct TileFmhaBwdTraits
+{
+    static constexpr index_t kPadHeadDimQ = kPadHeadDimQ_;
+    static constexpr index_t kPadHeadDimV = kPadHeadDimV_;
+    static constexpr auto BiasEnum        = BiasEnum_;
+    static constexpr bool kHasBiasGrad    = kHasBiasGrad_;
+    static constexpr index_t kBlockPerCu  = kBlockPerCu_;
+
+    static_assert(kPadHeadDimQ == 0 || kPadHeadDimQ == 8 || kPadHeadDimQ == 1);
+    static_assert(kPadHeadDimV == 0 || kPadHeadDimV == 8 || kPadHeadDimV == 1);
+};
+
 template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
           bool kPadSeqLenK_ /* padding for seqlen_k */,
           bool kPadHeadDimQ_ /* paddding for hdim_q */,
diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp
index ddb64a2189..71721f3408 100644
--- a/include/ck_tile/ops/fused_moe.hpp
+++ b/include/ck_tile/ops/fused_moe.hpp
@@ -16,5 +16,7 @@
 #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
 #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
index 6d95decaee..c69c15a2b0 100644
--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
@@ -240,7 +240,7 @@ struct FusedMoeGemmKernel
         if constexpr(UseUK)
         {
             __shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()];
-            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+            IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
                 *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
 
             num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0;
@@ -261,7 +261,7 @@ struct FusedMoeGemmKernel
         {
             // allocate LDS
             // __shared__ char smem_ptr[GetSmemSize()];
-            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+            IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
                 *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
             constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;
 
@@ -283,14 +283,14 @@ struct FusedMoeGemmKernel
                 return;
 
             const IndexDataType expert_id =
-                __builtin_amdgcn_readfirstlane(reinterpret_cast<const IndexDataType*>(
+                amd_wave_read_first_lane(reinterpret_cast<const IndexDataType*>(
                     kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
 
             // index along intermediate_size
             // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id *
             // BlockShape::Block_N0);
             index_t interm_idx_nr =
-                __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0);
+                amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0);
 
             const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col]
             const auto sorted_token_id =
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index faeb5cf6b3..42e2fad236 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -756,7 +756,7 @@ struct MoeSortingKernel
                                    void* smem) const
     {
         const index_t tid            = static_cast<index_t>(threadIdx.x);
-        const index_t wid            = __builtin_amdgcn_readfirstlane(tid / get_warp_size());
+        const index_t wid            = amd_wave_read_first_lane(tid / get_warp_size());
         const index_t lid            = __lane_id();
         constexpr index_t block_size = 256;           // blockDim.x;
         const index_t sub_tokens     = smem_rows - 2; // sub_tokens_mdiv.divisor;
@@ -1574,6 +1574,7 @@ struct MoeSortingMultiPhaseKernel_P0
         void* p_expert_mesh;        // [expert, tokens]
         index_t tokens; // if p_local_tokens is not nullptr, this indicate the max possible tokens
                         // used for ws/LDS calculation
+        index_t num_experts;
         index_t mesh_stride; // mesh_stride for p_expert_mesh
         mdiv topk_mdiv;
     };
@@ -1597,6 +1598,7 @@ struct MoeSortingMultiPhaseKernel_P0
         k.p_local_tokens = h.p_local_tokens;
         k.p_expert_mesh  = h.p_ws;
         k.tokens         = h.tokens;
+        k.num_experts    = h.num_experts;
         k.mesh_stride    = impl::moe_sorting_mp_mesh_stride(h.tokens);
         k.topk_mdiv      = mdiv{static_cast<uint32_t>(h.topk)};
         return k;
@@ -1655,14 +1657,18 @@ struct MoeSortingMultiPhaseKernel_P0
                 IndexType eid = x[j.value]; // ext_vector_type must use int to []
                 uint32_t curr_token_id, curr_topk_id;
                 kargs.topk_mdiv.divmod(i * Problem::SubTokenTile + j, curr_token_id, curr_topk_id);
-                if constexpr(Problem::LocalToken)
+                if(eid < kargs.num_experts)
                 {
-                    if(static_cast<index_t>(curr_token_id) < tokens)
+                    if constexpr(Problem::LocalToken)
+                    {
+                        if(static_cast<index_t>(curr_token_id) < tokens)
+                            p_expert_mesh[eid * mesh_stride + curr_token_id] =
+                                (curr_topk_id + 1) & 0xffff;
+                    }
+                    else
                         p_expert_mesh[eid * mesh_stride + curr_token_id] =
                             (curr_topk_id + 1) & 0xffff;
                 }
-                else
-                    p_expert_mesh[eid * mesh_stride + curr_token_id] = (curr_topk_id + 1) & 0xffff;
             });
         }
     }
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
index 38410721ae..3f0dbfb340 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -184,17 +184,17 @@ struct FusedMoeGemmPipeline_FlatmmUk
         index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1;
         index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1;
 
-        const IndexDataType expert_id = __builtin_amdgcn_readfirstlane(
+        const IndexDataType expert_id = amd_wave_read_first_lane(
             reinterpret_cast<const IndexDataType*>(kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
         index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
         index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;
 
         // nr*kr*w
-        index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
+        index_t interm_idx_nr0 = amd_wave_read_first_lane(
             intermediate_tile_id *
             BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W)
 
-        index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane(
+        index_t interm_idx_kr1 = amd_wave_read_first_lane(
             intermediate_tile_id *
             BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)
 
@@ -210,7 +210,8 @@ struct FusedMoeGemmPipeline_FlatmmUk
 
         auto a_res =
             make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
-                                      kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
+                                      kargs.num_tokens * kargs.stride_token * sizeof(ADataType),
+                                      std::true_type{});
 
         auto make_gu_win = [&](const auto* ptr_) {
             auto view_ = make_naive_tensor_view<address_space_enum::global>(
@@ -322,7 +323,8 @@ struct FusedMoeGemmPipeline_FlatmmUk
 
         auto o_res =
             make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
-                                      kargs.num_tokens * kargs.stride_token * sizeof(ODataType));
+                                      kargs.num_tokens * kargs.stride_token * sizeof(ODataType),
+                                      std::true_type{});
         auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
         auto w_scale      = GetWeightScale(
             row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 097ab2a896..e5f560d834 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -34,6 +34,7 @@
 #include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
@@ -73,5 +74,7 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_br_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_br_cr.hpp
index 7373a4890d..fe1d92e7d2 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_br_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_br_cr.hpp
@@ -14,11 +14,13 @@ namespace ck_tile {
 // A is block distributed tensor
 // B is block distributed tensor
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmARegBSmemCRegV1DefaultPolicy>
-struct BlockUniversalGemmArBrCr : public BlockUniversalGemmBase<Problem_, Policy_>
+template <typename Problem_,
+          typename Policy_     = BlockGemmARegBSmemCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
+struct BlockUniversalGemmArBrCr : public BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>
 {
     private:
-    using Base = BlockUniversalGemmBase<Problem_, Policy_>;
+    using Base = BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>;
     using Base::a_warp_y_index_zeros;
     using Base::a_warp_y_lengths;
     using Base::b_warp_y_index_zeros;
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_bs_cr.hpp
index 6331379110..cfa4b8a2f1 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_ar_bs_cr.hpp
@@ -14,11 +14,13 @@ namespace ck_tile {
 // A is block distributed tensor
 // B is block window on shared memory
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmARegBSmemCRegV1DefaultPolicy>
-struct BlockUniversalGemmArBsCr : public BlockUniversalGemmBase<Problem_, Policy_>
+template <typename Problem_,
+          typename Policy_     = BlockGemmARegBSmemCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
+struct BlockUniversalGemmArBsCr : public BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>
 {
     private:
-    using Base = BlockUniversalGemmBase<Problem_, Policy_>;
+    using Base = BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>;
     using Base::a_warp_y_index_zeros;
     using Base::a_warp_y_lengths;
     using Base::b_warp_y_index_zeros;
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_br_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_br_cr.hpp
index 2b40109d15..62ca99c596 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_br_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_br_cr.hpp
@@ -14,11 +14,13 @@ namespace ck_tile {
 // A is block window on shared memory
 // B is block distributed tensor
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmASmemBRegCRegV1DefaultPolicy>
-struct BlockUniversalGemmAsBrCr : public BlockUniversalGemmBase<Problem_, Policy_>
+template <typename Problem_,
+          typename Policy_     = BlockGemmASmemBRegCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
+struct BlockUniversalGemmAsBrCr : public BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>
 {
     private:
-    using Base = BlockUniversalGemmBase<Problem_, Policy_>;
+    using Base = BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>;
     using Base::a_warp_y_index_zeros;
     using Base::a_warp_y_lengths;
     using Base::b_warp_y_index_zeros;
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index 34b4d77dbc..144839f847 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
@@ -14,11 +15,13 @@ namespace ck_tile {
 // A is block window on shared memory
 // B is block window on shared memory
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmASmemBSmemCRegV1DefaultPolicy>
-struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy_>
+template <typename Problem_,
+          typename Policy_     = BlockGemmASmemBSmemCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
+struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>
 {
     private:
-    using Base = BlockUniversalGemmBase<Problem_, Policy_>;
+    using Base = BlockUniversalGemmBase<Problem_, Policy_, UnaryOpSize_>;
     using Base::a_warp_y_index_zeros;
     using Base::a_warp_y_lengths;
     using Base::b_warp_y_index_zeros;
@@ -88,7 +91,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
 
             if constexpr(std::is_same_v<ADataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+                Base::Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
             }
             else
             {
@@ -96,7 +99,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
             }
             if constexpr(std::is_same_v<BDataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+                Base::Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
             }
             else
             {
@@ -174,7 +177,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
         {
             if constexpr(std::is_same_v<ADataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+                Base::Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
             }
             else if constexpr(ALoadTranspose)
             {
@@ -192,7 +195,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
         {
             if constexpr(std::is_same_v<BDataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+                Base::Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
             }
             else if constexpr(BLoadTranspose)
             {
@@ -309,7 +312,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
 
             if constexpr(std::is_same_v<ADataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+                Base::Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
             }
             else if constexpr(ALoadTranspose)
             {
@@ -356,7 +359,7 @@ struct BlockUniversalGemmAsBsCr : public BlockUniversalGemmBase<Problem_, Policy
 
             if constexpr(std::is_same_v<BDataType, pk_int4_t>)
             {
-                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+                Base::Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
             }
             else if constexpr(BLoadTranspose)
             {
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_base.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_base.hpp
index 390a41af2f..85256679ae 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_base.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_base.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
@@ -11,7 +12,7 @@
 
 namespace ck_tile {
 
-template <typename Problem_, typename Policy_>
+template <typename Problem_, typename Policy_, index_t UnaryOpSize_>
 struct BlockUniversalGemmBase
 {
     protected:
@@ -81,7 +82,6 @@ struct BlockUniversalGemmBase
     };
 
     public:
-    // using Traits = UniversalGemmPolicyBase<Problem_, Policy_>::GemmTraits;
     using Traits = GemmTraits_<Problem_, Policy_>;
 
     using ADataType       = remove_cvref_t<typename Traits::ADataType>;
@@ -99,6 +99,7 @@ struct BlockUniversalGemmBase
     static constexpr index_t NWarp = Traits::NWarp;
 
     static constexpr auto Scheduler = Traits::Scheduler;
+    using Loader = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
 
     using AWarpDstr = typename WarpGemm::AWarpDstr;
     using BWarpDstr = typename WarpGemm::BWarpDstr;
diff --git a/include/ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1.hpp
index f4659c44fe..9036d48b08 100644
--- a/include/ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1.hpp
@@ -33,15 +33,14 @@ struct BlockWeightPreshuffleASmemBSmemCRegV1
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
+    static constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+    using WG                     = remove_cvref_t<decltype(config.template at<0>())>;
+
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
 
-        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
-
-        using WG = remove_cvref_t<decltype(config.template at<0>())>;
-
         constexpr index_t MWarp = config.template at<1>();
         constexpr index_t NWarp = config.template at<2>();
 
@@ -74,9 +73,6 @@ struct BlockWeightPreshuffleASmemBSmemCRegV1
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t KPerBlock = BlockGemmShape::kK;
 
-        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
-        using WG              = remove_cvref_t<decltype(config.template at<0>())>;
-
         constexpr index_t MWarp = config.template at<1>();
 
         constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index fcfbf9635f..6f9d53467f 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -90,10 +90,10 @@ struct BatchedGemmKernel
         !is_detected<is_tuple, BLayout>::value && !is_detected<is_tuple, BDataType>::value,
         "BLayout and BDataType must be scalars. Multiple parameters are not currently supported.");
 
-    /// @brief  C/ELayout and C/EDataType are expected to be scalars, not a tuple.
+    /// @brief  C/CLayout and C/EDataType are expected to be scalars, not a tuple.
     static_assert(!is_detected<is_tuple, CLayout>::value &&
                       !is_detected<is_tuple, CDataType>::value,
-                  "C/ELayout and C/EDataType must be scalars.");
+                  "C/CLayout and C/EDataType must be scalars.");
 
     struct BatchedGemmKernelArgs : ck_tile::UniversalGemmKernelArgs<>
     {
@@ -169,27 +169,27 @@ struct BatchedGemmKernel
     CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
     {
         const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
-        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto i_batch  = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const auto i_batch  = amd_wave_read_first_lane(blockIdx.y);
+        const auto i_splitk = amd_wave_read_first_lane(blockIdx.z);
 
         const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk);
 
         //  options
-        const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
-        const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
+        const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A);
+        const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A);
         const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + batch_offset_A +
                                  splitk_batch_offset.as_k_split_offset[0];
 
-        const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
-        const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
+        const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B);
+        const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B);
         const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + batch_offset_B +
                                  splitk_batch_offset.bs_k_split_offset[0];
 
-        const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E);
-        const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E);
+        const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E);
+        const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E);
         CDataType* c_ptr          = static_cast<CDataType*>(kargs.e_ptr) + batch_offset_C;
 
         // allocate LDS
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index e37b4f36d4..d632b1596c 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -89,7 +89,7 @@ struct GemmKernel
     /// @brief Specify the layout configurations for A, B, E and D
     using ALayout = remove_cvref_t<typename GemmPipeline::ALayout>;
     using BLayout = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using ELayout = remove_cvref_t<typename GemmPipeline::CLayout>;
+    using CLayout = remove_cvref_t<typename GemmPipeline::CLayout>;
 
     /// @brief  Specify the data type configurations for A, B, E and D
     using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
@@ -106,10 +106,10 @@ struct GemmKernel
         !is_detected<is_tuple, BLayout>::value && !is_detected<is_tuple, BDataType>::value,
         "BLayout and BDataType must be scalars. Multiple parameters are not currently supported.");
 
-    /// @brief  C/ELayout and C/EDataType are expected to be scalars, not a tuple.
-    static_assert(!is_detected<is_tuple, ELayout>::value &&
+    /// @brief  C/CLayout and C/EDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, CLayout>::value &&
                       !is_detected<is_tuple, EDataType>::value,
-                  "C/ELayout and C/EDataType must be scalars.");
+                  "C/CLayout and C/EDataType must be scalars.");
 
     static constexpr index_t NumATensor = 1;
     static constexpr index_t NumBTensor = 1;
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp
new file mode 100644
index 0000000000..b4ddc33e8d
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/host/concat.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/host/stream_utils.hpp"
+#include "ck_tile/core/utility/env.hpp"
+#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+/// @brief The MultiABD GEMM kernel host arguments.
+///
+/// @par Overview
+///      This structure is passed to @ref GemmKernelMultiABD "GemmKernelMultiABD" when creating
+///      kernel arguments object. It contain all necessary information required to build proper
+///      kernel argument and launch kernel on GPU. This structure defines the GEMM problem
+///      configuration by stating all required information like M,N,K sizes and respective strides.
+///      NumATensor describes the number of A tensors. The minimum number of tensors is 1(required).
+///      NumBTensor describes the number of B tensors. The minimum number of tensors is 1(required).
+///      NumDTensor describes the number of D tensors. The minimum number of tensors is 0(not
+///      required).
+template <index_t NumATensor, index_t NumBTensor, index_t NumDTensor>
+struct GemmMultiABDHostArgs
+{
+    CK_TILE_HOST GemmMultiABDHostArgs(const std::array<const void*, NumATensor>& as_ptr_,
+                                      const std::array<const void*, NumBTensor>& bs_ptr_,
+                                      const std::array<const void*, NumDTensor>& ds_ptr_,
+                                      void* e_ptr_,
+                                      index_t k_batch_,
+                                      index_t M_,
+                                      index_t N_,
+                                      index_t K_,
+                                      const std::array<index_t, NumATensor>& stride_As_,
+                                      const std::array<index_t, NumBTensor>& stride_Bs_,
+                                      const std::array<index_t, NumDTensor>& stride_Ds_,
+                                      index_t stride_E_)
+        : as_ptr(as_ptr_),
+          bs_ptr(bs_ptr_),
+          ds_ptr(ds_ptr_),
+          e_ptr(e_ptr_),
+          M(M_),
+          N(N_),
+          K(K_),
+          stride_As(stride_As_),
+          stride_Bs(stride_Bs_),
+          stride_Ds(stride_Ds_),
+          stride_E(stride_E_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const std::array<const void*, NumATensor> as_ptr;
+    const std::array<const void*, NumBTensor> bs_ptr;
+    const std::array<const void*, NumDTensor> ds_ptr;
+    union
+    {
+        void* e_ptr;
+        void* c_ptr;
+    };
+    index_t M;
+    index_t N;
+    index_t K;
+    const std::array<index_t, NumATensor> stride_As;
+    const std::array<index_t, NumBTensor> stride_Bs;
+    const std::array<index_t, NumDTensor> stride_Ds;
+    union
+    {
+        index_t stride_E;
+        index_t stride_C;
+    };
+
+    index_t k_batch;
+};
+
+template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
+struct GemmKernelMultiABD
+{
+    /// @brief Inject the UniversalGemmKernel base class to support execution of all necessary
+    /// functions.
+    using UniversalGemmKernel =
+        UniversalGemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
+    static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize;
+
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+
+    /// @brief  Specify the layout configurations for A, B, E and D
+    using AsLayout = remove_cvref_t<typename GemmPipeline::AsLayout>;
+    using BsLayout = remove_cvref_t<typename GemmPipeline::BsLayout>;
+    using CLayout  = remove_cvref_t<typename GemmPipeline::CLayout>;
+    using DsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
+
+    /// @brief  Specify the data type configurations for A, B, E and D
+    using AsDataType = remove_cvref_t<typename GemmPipeline::AsDataType>;
+    using BsDataType = remove_cvref_t<typename GemmPipeline::BsDataType>;
+    using EDataType  = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using DsDataType = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
+
+    /// @brief  ALayout and ADataType are expected to be a tuple, not a scalar.
+    static_assert(is_detected<is_tuple, AsLayout>::value &&
+                      is_detected<is_tuple, AsDataType>::value,
+                  "ALayout and ADataType must be a tuple.");
+
+    /// @brief  BLayout and BDataType are expected to be a tuple, not a scalar.
+    static_assert(is_detected<is_tuple, BsLayout>::value &&
+                      is_detected<is_tuple, BsDataType>::value,
+                  "BLayout and BDataType must be a tuple.");
+
+    /// @brief  CLayout and EDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, CLayout>::value &&
+                      !is_detected<is_tuple, EDataType>::value,
+                  "CLayout and EDataType must be a scalar.");
+
+    /// @brief  DsLayout and DsDataType are expected to be tuple, not a scalar.
+    static_assert(is_detected<is_tuple, DsLayout>::value &&
+                      is_detected<is_tuple, DsDataType>::value &&
+                      DsLayout::size() == DsDataType::size() && DsLayout::size() > 0,
+                  "DsLayout and DsDataType must be tuples and must have the same size.");
+
+    /// @brief The sizes of NumATensor, NumBTensor and NumDTensor is set by the user."
+    static constexpr index_t NumATensor = AsDataType::size();
+    static constexpr index_t NumBTensor = BsDataType::size();
+    static constexpr index_t NumDTensor = DsDataType::size();
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
+    using DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>;
+
+    CK_TILE_HOST static auto GetName() -> const std::string
+    {
+        return UniversalGemmKernel::GetName();
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) -> dim3
+    {
+        return UniversalGemmKernel::GridSize(M, N, KBatch);
+    }
+
+    CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
+    {
+        return UniversalGemmKernel::MaxOccupancyGridSize(s);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() -> dim3
+    {
+        return UniversalGemmKernel::BlockSize();
+    }
+
+    CK_TILE_HOST static constexpr auto
+    MakeKernelArgs(const GemmMultiABDHostArgs<NumATensor, NumBTensor, NumDTensor>& hostArgs) ->
+        typename UniversalGemmKernel::KernelArgs
+    {
+        /// @brief  Universal GEMM requires array objects and corresponding stride information for
+        /// matrices A, B, and D.
+        return UniversalGemmKernel::MakeKernelArgs(
+            UniversalGemmHostArgs<NumATensor, NumBTensor, NumDTensor>(hostArgs.as_ptr,
+                                                                      hostArgs.bs_ptr,
+                                                                      hostArgs.ds_ptr,
+                                                                      hostArgs.e_ptr,
+                                                                      hostArgs.k_batch,
+                                                                      hostArgs.M,
+                                                                      hostArgs.N,
+                                                                      hostArgs.K,
+                                                                      hostArgs.stride_As,
+                                                                      hostArgs.stride_Bs,
+                                                                      hostArgs.stride_Ds,
+                                                                      hostArgs.stride_E));
+    }
+
+    CK_TILE_HOST static auto
+    IsSupportedArgument(const typename UniversalGemmKernel::KernelArgs& kargs) -> bool
+    {
+        // Currently MultiABD kernel doesn't support k_batch > 1
+        if(kargs.k_batch > 1)
+        {
+            return false;
+        }
+        // Currently MultiABD kernel doesn't support F8 data type
+        if(ck_tile::get_device_name() == "gfx950" &&
+           (std::is_same<ck_tile::fp8_t, ADataType>::value ||
+            std::is_same<ck_tile::fp8_t, BDataType>::value ||
+            std::is_same<ck_tile::fp8_t, DDataType>::value))
+        {
+            return false;
+        }
+
+        return UniversalGemmKernel::IsSupportedArgument(kargs);
+    }
+
+    CK_TILE_DEVICE auto operator()(typename UniversalGemmKernel::KernelArgs kargs) const -> void
+    {
+        UniversalGemmKernel{}.template operator()(kargs);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
index 9d3ac8b901..b0b2905cb4 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
@@ -95,7 +95,7 @@ struct GemmKernelMultiD
     /// @brief  Specify the layout configurations for A, B, E and D
     using ALayout  = remove_cvref_t<typename GemmPipeline::ALayout>;
     using BLayout  = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using ELayout  = remove_cvref_t<typename GemmPipeline::CLayout>;
+    using CLayout  = remove_cvref_t<typename GemmPipeline::CLayout>;
     using DsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
 
     /// @brief  Specify the data type configurations for A, B, E and D
@@ -114,10 +114,10 @@ struct GemmKernelMultiD
                       !is_detected<is_tuple, BDataType>::value,
                   "BLayout and BDataType must be scalars.");
 
-    /// @brief  ELayout and EDataType are expected to be scalars, not a tuple.
-    static_assert(!is_detected<is_tuple, ELayout>::value &&
+    /// @brief  CLayout and EDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, CLayout>::value &&
                       !is_detected<is_tuple, EDataType>::value,
-                  "ELayout and EDataType must be scalars.");
+                  "CLayout and EDataType must be scalars.");
 
     /// @brief  DsLayout and DsDataType are expected to be tuple, not a scalar.
     static_assert(is_detected<is_tuple, DsLayout>::value &&
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index b621468e92..673f5abc34 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
 
 namespace ck_tile {
 
@@ -72,8 +73,8 @@ struct GemmTile2DPartitioner
     CK_TILE_DEVICE static auto
     GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple<index_t, index_t>
     {
-        const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx);
-        const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy);
+        const index_t iM = amd_wave_read_first_lane(blockIdx);
+        const index_t iN = amd_wave_read_first_lane(blockIdy);
         return make_tuple(iM, iN);
     }
 };
@@ -142,8 +143,8 @@ struct GemmTile1DPartitioner
     {
         const index_t NBlocks = integer_divide_ceil(N_, NPerBlock);
 
-        const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks);
-        const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks);
+        const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks);
+        const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks);
         return make_tuple(iM, iN);
     }
 
@@ -364,4 +365,449 @@ struct GemmSpatiallyLocalTilePartitioner
     index_t N;
 };
 
+/**
+ * @brief Stream-K tile partitioner that dynamically balances work across workgroups
+ *
+ * This partitioner is responsible for mapping workgroups to tiles in the C tensor
+ * for the Stream-K algorithm which decomposes the GEMM problem
+ * into smaller work units and distributes them more evenly across available blocks,
+ * improving load balancing especially for cases where the K dimension is large.
+ *
+ *  @tparam BlockGemmShapeType  A class providing basic GEMM parameters.
+ *  @tparam ReductionStrategy  A class that defines the reduction strategy for the results in
+ *  the C Tensor.
+ *  @tparam TileSwizzleSubM  A value that defines the size of the swizzle group along the m
+ *  dimension, where the swizzle group denotes consecutive tiles down a column. For instance a
+ *  swizzle group of 8 denotes tiles 0, 1, ..., 7, map to tiles [0,0], [1,0], ..., [7,0] in the C
+ *  tensor.
+ */
+template <typename BlockGemmShapeType,
+          StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic,
+          uint32_t TileSwizzleSubM                   = 8>
+struct StreamKTilePartitioner
+{
+    using BlockGemmShape = BlockGemmShapeType;
+
+    static constexpr uint32_t MPerBlock = BlockGemmShape::kM;
+    static constexpr uint32_t NPerBlock = BlockGemmShape::kN;
+    static constexpr uint32_t KPerBlock = BlockGemmShape::kK;
+
+    CK_TILE_HOST_DEVICE StreamKTilePartitioner() noexcept = delete;
+
+    /**
+     * @brief Construct Stream-K tile partitioner with problem dimensions
+     */
+    CK_TILE_HOST_DEVICE StreamKTilePartitioner(uint32_t M,
+                                               uint32_t N,
+                                               uint32_t K,
+                                               uint32_t num_cu,
+                                               uint32_t occupancy,
+                                               uint32_t sk_blocks = 0xffffffff) noexcept
+        : M_(M), N_(N), K_(K)
+    {
+        num_tile_m_ = integer_divide_ceil(M, MPerBlock);
+        num_tile_n_ = integer_divide_ceil(N, NPerBlock);
+        num_tile_k_ = integer_divide_ceil(K, KPerBlock);
+
+        constexpr uint32_t min_k_iters_per_sk_block = 2;
+        uint32_t num_tiles                          = num_tile_m_ * num_tile_n_;
+        k_iters_per_tile                            = mdiv(num_tile_k_);
+
+        // one cu can hold one wg at one time, from the whole cZ's point of view
+        // if number of wg is same as num_cu, we call it 1 dispatch
+        // if number of wg is 2x num_cu, we call it 2 dispatches.
+        // one dispatch can deliver wg same as num_cu (full dispatch), or less than num_cu (partial
+        // dispatch)
+        //
+        const uint32_t full_dispatches        = num_tiles / num_cu;
+        const uint32_t full_dispatch_tiles    = full_dispatches * num_cu;
+        const uint32_t partial_dispatch_tiles = num_tiles - full_dispatch_tiles;
+
+        uint32_t sk_occupancy = occupancy;
+        uint32_t dp_tiles     = full_dispatch_tiles;
+        uint32_t sk_tiles     = partial_dispatch_tiles;
+
+        if(full_dispatches < occupancy)
+        {
+            // in this case, we allocate all blocks as sk blocks
+            // sk_occupancy = occupancy - full_dispatches;
+            sk_occupancy = 1;
+            dp_tiles     = full_dispatch_tiles;
+            sk_tiles     = partial_dispatch_tiles;
+        }
+        else if((occupancy > 1) && (full_dispatches % occupancy == occupancy - 1))
+        {
+            // e.g. occupancy = 2, full_dispatches = 3, 5, 7 ...
+            //      occupancy = 3, full_dispatches = 5, 8, 11 ...
+            //      occupancy = 4, full_dispatches = 7, 11 ...
+            sk_occupancy = 1; // left 1 slot for sk occupancy
+            dp_tiles     = full_dispatch_tiles;
+            sk_tiles     = partial_dispatch_tiles;
+        }
+        else
+        {
+            // otherwise, we reduce 1 dispatch from dp, together with partial dispatch,
+            // to construct sk dispatch
+            sk_occupancy = occupancy - ((full_dispatches - 1) % occupancy);
+            dp_tiles     = full_dispatch_tiles - num_cu;
+            sk_tiles     = partial_dispatch_tiles + num_cu;
+        }
+
+        // uint32_t dp_iters_per_block = k_iters_per_tile.get();
+        uint32_t sk_total_iters = k_iters_per_tile.get() * sk_tiles;
+        uint32_t dp_num_blocks  = 0;
+
+        {
+            const uint32_t min_sk_tiles = (sk_tiles >= num_cu) ? num_cu : (sk_tiles + 1);
+            const uint32_t max_sk_tiles =
+                (sk_tiles >= num_cu) ? num_cu * sk_occupancy
+                                     : min(num_cu, sk_total_iters / min_k_iters_per_sk_block);
+
+            // if use dp for sk-block, how many iters do we need
+            const uint32_t dp_for_sk_iters = k_iters_per_tile.get();
+
+            uint32_t best_sk_score =
+                std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
+                tentative_sk_blocks++)
+            {
+                const uint32_t tentative_sk_iters_per_block =
+                    (sk_total_iters + tentative_sk_blocks - 1) / tentative_sk_blocks;
+                const uint32_t tentative_sk_iters = tentative_sk_iters_per_block;
+                const uint32_t sk_blocks_per_tile = (tentative_sk_blocks + sk_tiles - 1) / sk_tiles;
+
+                //       the more sk_blocks_per_tile, the worse the overhead
+                uint32_t cross_sk_blocks_overhead = sk_blocks_per_tile;
+                if(tentative_sk_blocks % sk_tiles != 0)
+                {
+                    // penalty for uneven divide
+                    cross_sk_blocks_overhead +=
+                        sk_blocks_per_tile * tentative_sk_iters_per_block / 50;
+                }
+
+                const uint32_t tentative_sk_score = tentative_sk_iters + cross_sk_blocks_overhead;
+
+                if(tentative_sk_score < best_sk_score)
+                {
+                    best_sk_score = tentative_sk_score;
+                    sk_num_blocks = tentative_sk_blocks;
+                }
+            }
+
+            if(best_sk_score >= dp_for_sk_iters)
+            {
+                sk_num_blocks = 0;
+            }
+
+            // give a chance to control num of sk blocks
+            sk_num_blocks = sk_blocks != 0xffffffff ? sk_blocks : sk_num_blocks;
+
+            if(sk_num_blocks == 0)
+            {
+                sk_num_big_blocks     = 0;
+                k_iters_per_big_block = 0;
+
+                dp_num_blocks      = num_tiles; // all tile to be dp block
+                dp_start_block_idx = 0;
+                sk_total_iters     = 0; // clear this tiles
+            }
+            else
+            {
+                // k_iters_per_sk_block is the floor of avg each ck block loop over tiles.
+                // we need to decide how many iters for each sk block
+                // let m = k_iters_per_sk_block
+                // some of the sk block (little) will cover m iters, some (big) will cover m+1
+                // we have
+                // 1) l + b = sk_blocks
+                // 2) l * m + b * (m + 1) = sk_total_iters
+                //      => (l + b) * m + b = sk_total_iters
+                //      => sk_blocks * m + b = sk_total_iters
+                //      => b = sk_total_iters - m * sk_blocks
+                //      NOTE: big could be zero
+                const uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks;
+                sk_num_big_blocks     = sk_total_iters - k_iters_per_sk_block * sk_num_blocks;
+                k_iters_per_big_block = k_iters_per_sk_block + 1;
+
+                dp_num_blocks      = dp_tiles;
+                dp_start_block_idx = (sk_num_blocks + num_cu - 1) / num_cu * num_cu;
+            }
+        }
+        n_tiles                   = mdiv2(num_tile_n_);
+        reduction_start_block_idx = dp_start_block_idx + dp_num_blocks;
+
+        if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
+        {
+            const uint32_t upper_big    = lcm(k_iters_per_big_block, k_iters_per_tile.get());
+            const uint32_t upper_little = lcm(k_iters_per_big_block - 1, k_iters_per_tile.get());
+            equiv_tiles_big             = mdiv(upper_big / k_iters_per_tile.get());
+            equiv_tiles_little          = mdiv(upper_little / k_iters_per_tile.get());
+        }
+    }
+
+    /**
+     * @brief Calculate optimal grid size for Stream-K
+     */
+    CK_TILE_HOST auto GridSize() const noexcept -> dim3
+    {
+        if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
+        {
+            return dim3(reduction_start_block_idx + GetSkTiles(), 1, 1);
+        }
+        else
+            return dim3(reduction_start_block_idx, 1, 1);
+    }
+
+    /**
+     * @brief Calculate number of loop iterations over K dimension for given work unit
+     */
+    CK_TILE_HOST_DEVICE static auto GetLoopNum(uint32_t K) noexcept -> uint32_t
+    {
+        return integer_divide_ceil(K, KPerBlock); // Stream-K processes one K-slice at a time
+    }
+
+    /**
+     * @brief Get output tile index for standard 2D mapping (compatibility)
+     */
+    CK_TILE_DEVICE auto
+    GetOutputTileIndex(uint32_t tile_idx) const noexcept -> tuple<uint32_t, uint32_t>
+    {
+        uint32_t m_tile_idx, n_tile_idx;
+        n_tiles.divmod(tile_idx, num_tile_n_, m_tile_idx, n_tile_idx);
+
+        // swizzle tile
+
+        uint32_t tile_swizzle_sub_m_rem = num_tile_m_ % TileSwizzleSubM;
+
+        const auto sub_m_adapt = (m_tile_idx < (num_tile_m_ - tile_swizzle_sub_m_rem))
+                                     ? TileSwizzleSubM
+                                     : tile_swizzle_sub_m_rem;
+
+        uint32_t m_tile_idx_sub0, m_tile_idx_sub1;
+        m_tile_idx_sub0 = m_tile_idx / TileSwizzleSubM;
+        m_tile_idx_sub1 = m_tile_idx % TileSwizzleSubM;
+
+        uint32_t tile_idx_local = n_tile_idx + m_tile_idx_sub1 * num_tile_n_;
+
+        uint32_t m_tile_idx_with_adapt, n_tile_idx_with_adapt;
+
+        n_tile_idx_with_adapt = tile_idx_local / sub_m_adapt;
+        m_tile_idx_with_adapt = tile_idx_local % sub_m_adapt;
+        return make_tuple(m_tile_idx_with_adapt + m_tile_idx_sub0 * TileSwizzleSubM,
+                          n_tile_idx_with_adapt);
+    }
+
+    /**
+     * @brief Get work range for a given block ID
+     */
+    CK_TILE_DEVICE void
+    GetBlockItr(uint32_t block_idx, uint32_t& iter_start, uint32_t& iter_end) const noexcept
+    {
+        if(block_idx < sk_num_big_blocks)
+        {
+            iter_start = block_idx * k_iters_per_big_block;
+            iter_end   = iter_start + k_iters_per_big_block;
+        }
+        else if(block_idx < sk_num_blocks)
+        {
+            iter_start = (sk_num_big_blocks * k_iters_per_big_block) +
+                         (block_idx - sk_num_big_blocks) * (k_iters_per_big_block - 1);
+            iter_end = iter_start + (k_iters_per_big_block - 1);
+        }
+        else if(block_idx >= dp_start_block_idx)
+        {
+            uint32_t sk_total_iters     = GetSkTotalIters();
+            uint32_t dp_iters_per_block = k_iters_per_tile.get();
+            iter_start = sk_total_iters + (block_idx - dp_start_block_idx) * dp_iters_per_block;
+            iter_end   = iter_start + dp_iters_per_block;
+        }
+    }
+
+    /**
+     * @brief Get total number of iterations for sk tiles
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetSkTotalIters() const noexcept
+    {
+        uint32_t sk_total_iters = sk_num_big_blocks * k_iters_per_big_block +
+                                  (sk_num_blocks - sk_num_big_blocks) * (k_iters_per_big_block - 1);
+        return sk_total_iters;
+    }
+
+    /**
+     * @brief Get total number of sk tiles
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetSkTiles() const noexcept
+    {
+        // tiles for sk
+        uint32_t sk_total_iters = GetSkTotalIters();
+        return k_iters_per_tile.div(sk_total_iters);
+    }
+
+    /**
+     * @brief Get length of loop iterations for stream-k loop
+     */
+    CK_TILE_DEVICE uint32_t GetCurrentIterLength(uint32_t iter_start,
+                                                 uint32_t iter_end) const noexcept
+    {
+        // A WG's iter_end is either in the current C macro tile or not.
+        // If it is not, then the macro tile boundary is where the WG must stop.
+        uint32_t distance_to_tile_boundary =
+            k_iters_per_tile.get() - (iter_start % k_iters_per_tile.get());
+        return min(iter_start + distance_to_tile_boundary, iter_end) - iter_start;
+    }
+
+    /**
+     * @brief Get index of tile during a specified iteration
+     */
+    CK_TILE_DEVICE uint32_t GetTileIdx(uint32_t iter) const noexcept
+    {
+        return k_iters_per_tile.div(iter);
+    }
+
+    /**
+     * @brief Get index of tile during a specified iteration
+     */
+    CK_TILE_DEVICE void
+    GetTileIdxWithOffset(uint32_t iter, uint32_t& tile_idx, uint32_t& iter_offset) const noexcept
+    {
+        k_iters_per_tile.divmod(iter, tile_idx, iter_offset);
+    }
+
+    /**
+     * @brief Calculates the buffer space needed for accumulation
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForAcc(uint32_t acc_element_bytes) const noexcept
+    {
+        static constexpr uint32_t alignment = 128;
+        uint32_t acc_buffer_bytes =
+            MPerBlock * NPerBlock * GetTotalAccBuffers() * acc_element_bytes;
+        return (acc_buffer_bytes + alignment - 1) / alignment * alignment;
+    }
+
+    /**
+     * @brief Calculates the buffer space needed for the semaphore
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForSemaphore() const noexcept
+    {
+        return GetSkTiles() * sizeof(uint32_t);
+    }
+
+    /**
+     * @brief Calculates the total buffer space needed for accumulation and the semaphore
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSize(uint32_t acc_element_bytes) const noexcept
+    {
+        return GetWorkSpaceSizeForAcc(acc_element_bytes) + GetWorkSpaceSizeForSemaphore();
+    }
+
+    /**
+     * @brief Get location of intersection of tiles for reduction
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetTileIntersections(uint32_t tiles_,
+                                                      const mdiv& equiv_tiles_) const noexcept
+    {
+        uint32_t tile_idx_        = tiles_ == 0 ? 0 : (tiles_ - 1);
+        uint32_t max_equiv_tiles_ = equiv_tiles_.get() - 1;
+        uint32_t quo_, rem_;
+        equiv_tiles_.divmod(tile_idx_, quo_, rem_);
+        return quo_ * max_equiv_tiles_ + rem_;
+    }
+
+    /**
+     * @brief Calculate the number of tiles needed for the number of sk blocks
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetTilesCoverSkBlock(uint32_t num_sk_blocks_,
+                                                      uint32_t iters_per_sk_block_) const noexcept
+    {
+        return k_iters_per_tile.div(num_sk_blocks_ * iters_per_sk_block_ + k_iters_per_tile.get() -
+                                    1);
+    }
+
+    /**
+     * @brief Calculate the amount of total accumulation buffers required for stream-k
+     */
+    CK_TILE_HOST_DEVICE uint32_t GetTotalAccBuffers() const noexcept
+    {
+        uint32_t tiles_cover_big_blocks =
+            GetTilesCoverSkBlock(sk_num_big_blocks, k_iters_per_big_block);
+        uint32_t tiles_cover_little_blocks =
+            GetTilesCoverSkBlock(sk_num_blocks - sk_num_big_blocks, k_iters_per_big_block - 1);
+
+        uint32_t total_intersec_big = GetTileIntersections(tiles_cover_big_blocks, equiv_tiles_big);
+        uint32_t total_intersec_little =
+            GetTileIntersections(tiles_cover_little_blocks, equiv_tiles_little);
+
+        return sk_num_blocks + total_intersec_big + total_intersec_little;
+    }
+
+    /**
+     * @brief Calculate offset based on tile index for big/little tiles
+     */
+    CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromTile(uint32_t tile_idx_) const noexcept
+    {
+        uint32_t tiles_cover_big_blocks =
+            GetTilesCoverSkBlock(sk_num_big_blocks, k_iters_per_big_block);
+        if(tile_idx_ < tiles_cover_big_blocks)
+        {
+            uint32_t touched_sk_blocks =
+                (tile_idx_ * k_iters_per_tile.get() + k_iters_per_big_block - 1) /
+                k_iters_per_big_block;
+            uint32_t current_intersec = GetTileIntersections(tile_idx_, equiv_tiles_big);
+            return touched_sk_blocks + current_intersec;
+        }
+        else
+        {
+            uint32_t iters_per_little_sk_block = k_iters_per_big_block - 1;
+            uint32_t tile_idx_little_reverse   = GetSkTiles() - tile_idx_;
+            uint32_t touched_sk_blocks =
+                (tile_idx_little_reverse * k_iters_per_tile.get() + iters_per_little_sk_block - 1) /
+                iters_per_little_sk_block;
+            uint32_t current_intersec =
+                GetTileIntersections(tile_idx_little_reverse, equiv_tiles_little);
+            return GetTotalAccBuffers() - (touched_sk_blocks + current_intersec);
+        }
+    }
+
+    /**
+     * @brief Calculate offset based on block_idx index for big/little streamk blocks
+     */
+    CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromBlock(uint32_t block_idx_) const noexcept
+    {
+        uint32_t iters_per_big_sk_block    = k_iters_per_big_block;
+        uint32_t iters_per_little_sk_block = k_iters_per_big_block - 1;
+        if(block_idx_ < sk_num_big_blocks)
+        {
+            uint32_t touched_tiles    = k_iters_per_tile.div(block_idx_ * iters_per_big_sk_block +
+                                                          k_iters_per_tile.get() - 1);
+            uint32_t current_intersec = GetTileIntersections(touched_tiles, equiv_tiles_big);
+            return block_idx_ + current_intersec;
+        }
+        else
+        {
+            uint32_t block_idx_little_reverse = sk_num_blocks - block_idx_;
+            uint32_t touched_tiles            = k_iters_per_tile.div(
+                block_idx_little_reverse * iters_per_little_sk_block + k_iters_per_tile.get() - 1);
+            uint32_t current_intersec = GetTileIntersections(touched_tiles, equiv_tiles_little);
+            return GetTotalAccBuffers() - (block_idx_little_reverse + current_intersec);
+        }
+    }
+
+    // Getters for problem dimensions
+    CK_TILE_HOST_DEVICE uint32_t GetNumTileM() const noexcept { return num_tile_m_; }
+    CK_TILE_HOST_DEVICE uint32_t GetNumTileN() const noexcept { return num_tile_n_; }
+    CK_TILE_HOST_DEVICE uint32_t GetNumTileK() const noexcept { return num_tile_k_; }
+
+    uint32_t sk_num_blocks;
+    uint32_t sk_num_big_blocks;
+    uint32_t dp_start_block_idx;
+    uint32_t reduction_start_block_idx;
+    uint32_t k_iters_per_big_block;
+    mdiv2 n_tiles;
+    mdiv k_iters_per_tile;
+    mdiv equiv_tiles_big;    // for reduction
+    mdiv equiv_tiles_little; // for reduction
+
+    private:
+    uint32_t M_, N_, K_;
+    uint32_t num_tile_m_, num_tile_n_, num_tile_k_;
+};
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index eac7f547c1..551dc6f50d 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -23,10 +23,13 @@ namespace ck_tile {
 ///      arguments object. It contain all necessary information required to build proper kernel
 ///      argument and launch kernel on GPU. This structure defines the GEMM problem configuration by
 ///      stating all required information like M,N,K sizes and respective strides.
+
+template <index_t NumDTensor = 0>
 struct GroupedGemmHostArgs
 {
     CK_TILE_HOST GroupedGemmHostArgs(const void* a_ptr_,
                                      const void* b_ptr_,
+                                     const std::array<const void*, NumDTensor>& ds_ptr_,
                                      void* e_ptr_,
                                      index_t k_batch_,
                                      index_t M_,
@@ -34,15 +37,18 @@ struct GroupedGemmHostArgs
                                      index_t K_,
                                      index_t stride_A_,
                                      index_t stride_B_,
+                                     const std::array<index_t, NumDTensor>& stride_Ds_,
                                      index_t stride_E_)
         : a_ptr(a_ptr_),
           b_ptr(b_ptr_),
+          ds_ptr(ds_ptr_),
           e_ptr(e_ptr_),
           M(M_),
           N(N_),
           K(K_),
           stride_A(stride_A_),
           stride_B(stride_B_),
+          stride_Ds(stride_Ds_),
           stride_E(stride_E_),
           k_batch(k_batch_)
     {
@@ -50,6 +56,7 @@ struct GroupedGemmHostArgs
 
     const void* a_ptr;
     const void* b_ptr;
+    const std::array<const void*, NumDTensor> ds_ptr;
     union
     {
         void* e_ptr;
@@ -61,7 +68,7 @@ struct GroupedGemmHostArgs
     index_t K;
     index_t stride_A;
     index_t stride_B;
-
+    const std::array<index_t, NumDTensor> stride_Ds;
     union
     {
         index_t stride_E;
@@ -71,20 +78,23 @@ struct GroupedGemmHostArgs
     index_t k_batch;
 };
 
+template <index_t NumDTensor = 0>
 struct GemmTransKernelArg
 {
-    UniversalGemmKernelArgs<> group_karg;
+    UniversalGemmKernelArgs<1, 1, NumDTensor> group_karg;
     ck_tile::index_t block_start;
     ck_tile::index_t block_end;
 
     GemmTransKernelArg() = delete;
-    GemmTransKernelArg(UniversalGemmKernelArgs<>&& karg, index_t bl_start, index_t bl_end)
-        : group_karg{karg}, block_start{bl_start}, block_end{bl_end}
+    GemmTransKernelArg(UniversalGemmKernelArgs<1, 1, NumDTensor>&& karg,
+                       index_t bl_start,
+                       index_t bl_end)
+        : group_karg{std::move(karg)}, block_start{bl_start}, block_end{bl_end}
     {
     }
 
-    GemmTransKernelArg(UniversalGemmKernelArgs<>&& karg)
-        : group_karg{karg}, block_start{0}, block_end{0}
+    GemmTransKernelArg(UniversalGemmKernelArgs<1, 1, NumDTensor>&& karg)
+        : group_karg{std::move(karg)}, block_start{0}, block_end{0}
     {
     }
 };
@@ -106,9 +116,12 @@ struct GroupedGemmKernel
     using CLayout = remove_cvref_t<typename GemmPipeline::CLayout>;
 
     /// @brief Specify the data type configurations for A, B, C/E
-    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
-    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using ADataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType  = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType  = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using DsDataType = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
+
+    static constexpr index_t NumDTensor_ = DsDataType::size();
 
     /// @brief ALayout and ADataType are expected to be scalars, not a tuple.
     static_assert(
@@ -120,10 +133,10 @@ struct GroupedGemmKernel
         !is_detected<is_tuple, BLayout>::value && !is_detected<is_tuple, BDataType>::value,
         "BLayout and BDataType must be scalars. Multiple parameters are not currently supported.");
 
-    /// @brief  C/ELayout and C/EDataType are expected to be scalars, not a tuple.
+    /// @brief  C/CLayout and C/EDataType are expected to be scalars, not a tuple.
     static_assert(!is_detected<is_tuple, CLayout>::value &&
                       !is_detected<is_tuple, CDataType>::value,
-                  "C/ELayout and C/EDataType must be scalars.");
+                  "C/CLayout and C/EDataType must be scalars.");
 
     using OffsetTile1DPartitioner = OffsettedTile1DPartitioner<TilePartitioner>;
     using Kernel = GroupedGemmKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>;
@@ -140,19 +153,21 @@ struct GroupedGemmKernel
                       concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock),
                       concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()),
                       concat('x', P_::kPadM, P_::kPadN, P_::kPadK),
-                      (UsePersistentKernel ? "Persistent" : "NonPersistent"));
+                      (UsePersistentKernel ? "Persistent" : "NonPersistent"),
+                      (NumDTensor_ == 2 ? "MultiD" : "NoMultiD"),
+                      (GemmPipeline::DoubleSmemBuffer ? "DoubleSmemBuffer" : "SingleSmemBuffer"));
         // clang-format on
     }
 
     CK_TILE_HOST static auto
-    GetWorkSpaceSize(const std::vector<GroupedGemmHostArgs>& gemm_descs) -> std::size_t
+    GetWorkSpaceSize(const std::vector<GroupedGemmHostArgs<>>& gemm_descs) -> std::size_t
     {
-        return gemm_descs.size() * sizeof(GemmTransKernelArg);
+        return gemm_descs.size() * sizeof(GemmTransKernelArg<NumDTensor_>);
     }
 
     CK_TILE_HOST static auto GetWorkSpaceSize(index_t group_count) -> std::size_t
     {
-        return group_count * sizeof(GemmTransKernelArg);
+        return group_count * sizeof(GemmTransKernelArg<NumDTensor_>);
     }
 
     CK_TILE_HOST static auto BlockSize() -> dim3
@@ -184,7 +199,8 @@ struct GroupedGemmKernel
         return dim3(grid_size, 1, 1);
     }
 
-    CK_TILE_HOST static auto GridSize(const std::vector<GroupedGemmHostArgs>& gemm_descs)
+    CK_TILE_HOST static auto
+    GridSize(const std::vector<GroupedGemmHostArgs<NumDTensor_>>& gemm_descs)
     {
         index_t grid_size = 0;
         for(const auto& it_desc : gemm_descs)
@@ -196,9 +212,10 @@ struct GroupedGemmKernel
     }
 
     CK_TILE_HOST static auto
-    MakeKargs(const std::vector<GroupedGemmHostArgs>& gemm_descs) -> std::vector<GemmTransKernelArg>
+    MakeKargs(const std::vector<GroupedGemmHostArgs<NumDTensor_>>& gemm_descs)
+        -> std::vector<GemmTransKernelArg<NumDTensor_>>
     {
-        std::vector<GemmTransKernelArg> gemm_kernel_args_;
+        std::vector<GemmTransKernelArg<NumDTensor_>> gemm_kernel_args_;
         index_t group_count = ck_tile::type_convert<ck_tile::index_t>(gemm_descs.size());
         index_t grid_size   = 0;
         gemm_kernel_args_.reserve(group_count);
@@ -217,6 +234,7 @@ struct GroupedGemmKernel
             const index_t stride_a = gemm_descs[i].stride_A;
             const index_t stride_b = gemm_descs[i].stride_B;
             const index_t stride_e = gemm_descs[i].stride_E;
+            auto stride_ds         = gemm_descs[i].stride_Ds;
 
             const index_t grid_size_grp = TilePartitioner::GridSize(M, N) * gemm_descs[i].k_batch;
 
@@ -225,19 +243,19 @@ struct GroupedGemmKernel
 
             grid_size += grid_size_grp;
 
-            auto karg =
-                UniversalGemmKernelArgs<>{{type_convert<const ADataType*>(gemm_descs[i].a_ptr)},
-                                          {type_convert<const BDataType*>(gemm_descs[i].b_ptr)},
-                                          {/*ds_ptr*/},
-                                          type_convert<CDataType*>(gemm_descs[i].e_ptr),
-                                          M,
-                                          N,
-                                          K,
-                                          {stride_a},
-                                          {stride_b},
-                                          {/*stride_ds*/},
-                                          stride_e,
-                                          gemm_descs[i].k_batch};
+            auto karg = UniversalGemmKernelArgs<1, 1, NumDTensor_>{
+                {type_convert<const ADataType*>(gemm_descs[i].a_ptr)},
+                {type_convert<const BDataType*>(gemm_descs[i].b_ptr)},
+                {gemm_descs[i].ds_ptr},
+                type_convert<CDataType*>(gemm_descs[i].e_ptr),
+                M,
+                N,
+                K,
+                {stride_a},
+                {stride_b},
+                stride_ds,
+                stride_e,
+                gemm_descs[i].k_batch};
 
             gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end);
         }
@@ -245,7 +263,8 @@ struct GroupedGemmKernel
         return gemm_kernel_args_;
     }
 
-    CK_TILE_HOST static bool IsSupportedArgument(const std::vector<GemmTransKernelArg>& kargs)
+    CK_TILE_HOST static bool
+    IsSupportedArgument(const std::vector<GemmTransKernelArg<NumDTensor_>>& kargs)
     {
         for(const auto& karg : kargs)
         {
@@ -262,14 +281,18 @@ struct GroupedGemmKernel
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
-    CK_TILE_DEVICE void Run(const UniversalGemmKernelArgs<>& kargs,
+    CK_TILE_DEVICE void Run(const UniversalGemmKernelArgs<1, 1, NumDTensor_>& kargs,
                             const tuple<index_t, index_t>& block_idx_2d,
                             const index_t block_idx_z) const
     {
+
+        static_assert(GemmPipeline::DoubleSmemBuffer || !GemmPipeline::Preshuffle,
+                      "SingleSmemBuffer and Preshuffle cannot both be enabled simultaneously!");
+
         const auto [iM, iN] = block_idx_2d;
 
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
         const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z);
 
@@ -282,47 +305,43 @@ struct GroupedGemmKernel
         // allocate LDS
         __shared__ char smem_ptr_0[GetSmemSize()];
 
+        // TO DO:
+        // Can we simplify this branching logic?
         if constexpr(GemmPipeline::DoubleSmemBuffer == true)
         {
+
             __shared__ char smem_ptr_1[GetSmemSize()];
-            if constexpr(UsePersistentKernel)
-            {
-                RunGemmWithPipelineSelection2LDS(a_ptr,
-                                                 b_ptr,
-                                                 c_ptr,
-                                                 smem_ptr_0,
-                                                 smem_ptr_1,
-                                                 kargs,
-                                                 splitk_batch_offset,
-                                                 i_m,
-                                                 i_n);
-            }
-            else
-            {
-                Base::RunGemm2LDS({a_ptr},
-                                  {b_ptr},
-                                  {/*ds_ptr*/},
-                                  c_ptr,
-                                  smem_ptr_0,
-                                  smem_ptr_1,
-                                  kargs,
-                                  splitk_batch_offset,
-                                  i_m,
-                                  i_n);
-            }
+            RunGemmWithPipelineSelection2LDS(a_ptr,
+                                             b_ptr,
+                                             c_ptr,
+                                             kargs.ds_ptr,
+                                             smem_ptr_0,
+                                             smem_ptr_1,
+                                             kargs,
+                                             splitk_batch_offset,
+                                             i_m,
+                                             i_n);
         }
-        else
+        else // SingleSmemBuffer
         {
+
             if constexpr(UsePersistentKernel)
             {
-                RunGemmWithPipelineSelection(
-                    a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+                RunGemmWithPipelineSelection(a_ptr,
+                                             b_ptr,
+                                             kargs.ds_ptr,
+                                             c_ptr,
+                                             smem_ptr_0,
+                                             kargs,
+                                             splitk_batch_offset,
+                                             i_m,
+                                             i_n);
             }
-            else
+            else // Non-persistent kernel
             {
                 Base::RunGemm({a_ptr},
                               {b_ptr},
-                              {/*ds_ptr*/},
+                              kargs.ds_ptr,
                               c_ptr,
                               smem_ptr_0,
                               kargs,
@@ -354,9 +373,10 @@ struct GroupedGemmKernel
     CK_TILE_DEVICE static void
     RunGemmWithPipelineSelection(const ADataType* a_ptr,
                                  const BDataType* b_ptr,
+                                 const std::array<const void*, NumDTensor_>& ds_ptr,
                                  CDataType* c_ptr,
                                  void* smem_ptr_0,
-                                 const UniversalGemmKernelArgs<>& kargs,
+                                 const UniversalGemmKernelArgs<1, 1, NumDTensor_>& kargs,
                                  const typename Base::SplitKBatchOffset& splitk_batch_offset,
                                  const index_t block_idx_m,
                                  const index_t block_idx_n)
@@ -364,7 +384,7 @@ struct GroupedGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             Base::template MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
-                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset);
+                {a_ptr}, {b_ptr}, ds_ptr, c_ptr, kargs, splitk_batch_offset.splitted_k);
 
         const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows =
@@ -374,18 +394,14 @@ struct GroupedGemmKernel
         const auto& d_block_window = gemm_tile_windows.at(Base::I2);
 
         // Get hot-loop and tail configuration
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop =
+            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
         const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
         const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
 
         // Run GEMM pipeline
-        const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0],
-                                                                      b_block_window[Base::I0],
-                                                                      num_loop,
-                                                                      has_hot_loop,
-                                                                      tail_num,
-                                                                      smem_ptr_0);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, has_hot_loop, tail_num, smem_ptr_0);
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(Base::I3);
         EpiloguePipeline{}.template
@@ -416,9 +432,10 @@ struct GroupedGemmKernel
     RunGemmWithPipelineSelection2LDS(const ADataType* a_ptr,
                                      const BDataType* b_ptr,
                                      CDataType* c_ptr,
+                                     const std::array<const void*, NumDTensor_>& ds_ptr,
                                      void* __restrict__ smem_ptr_0,
                                      void* __restrict__ smem_ptr_1,
-                                     const UniversalGemmKernelArgs<>& kargs,
+                                     const UniversalGemmKernelArgs<1, 1, NumDTensor_>& kargs,
                                      const typename Base::SplitKBatchOffset& splitk_batch_offset,
                                      const index_t block_idx_m,
                                      const index_t block_idx_n)
@@ -426,7 +443,7 @@ struct GroupedGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             Base::template MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
-                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset);
+                {a_ptr}, {b_ptr}, ds_ptr, c_ptr, kargs, splitk_batch_offset.splitted_k);
 
         const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows =
@@ -436,19 +453,36 @@ struct GroupedGemmKernel
         const auto& d_block_window = gemm_tile_windows.at(Base::I2);
 
         // Get hot-loop and tail configuration
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
-        const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
+        const index_t num_loop =
+            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
         const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
 
-        // Run GEMM pipeline
-        const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0],
-                                                                      b_block_window[Base::I0],
-                                                                      num_loop,
-                                                                      has_hot_loop,
-                                                                      tail_num,
-                                                                      smem_ptr_0,
-                                                                      smem_ptr_1);
+        // Run GEMM pipeline with compile-time branching
+        const auto& c_block_tile = [&]() {
+            if constexpr(GemmPipeline::Preshuffle)
+            {
+                // Preshuffle version - without has_hot_loop parameter
+                return GemmPipeline{}.template operator()(a_block_window[Base::I0],
+                                                          b_block_window[Base::I0],
+                                                          num_loop,
+                                                          tail_num,
+                                                          smem_ptr_0,
+                                                          smem_ptr_1);
+            }
+            else
+            {
+                // Regular version - with has_hot_loop parameter
+                const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop);
+                return GemmPipeline{}.template operator()(a_block_window[Base::I0],
+                                                          b_block_window[Base::I0],
+                                                          num_loop,
+                                                          has_hot_loop,
+                                                          tail_num,
+                                                          smem_ptr_0,
+                                                          smem_ptr_1);
+            }
+        }();
+
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(Base::I3);
         EpiloguePipeline{}.template
@@ -456,7 +490,7 @@ struct GroupedGemmKernel
             c_block_window, c_block_tile, d_block_window, smem_ptr_0);
     }
 
-    CK_TILE_DEVICE index_t FindGroupId(const GemmTransKernelArg* gemm_desc_ptr,
+    CK_TILE_DEVICE index_t FindGroupId(const GemmTransKernelArg<NumDTensor_>* gemm_desc_ptr,
                                        index_t block_id,
                                        index_t group_count) const
     {
@@ -488,11 +522,12 @@ struct GroupedGemmKernel
                                    index_t group_count) const
     {
         const index_t block_id   = ck_tile::get_block_1d_id();
-        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg*>(
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg<NumDTensor_>*>(
             cast_pointer_to_generic_address_space(gemm_descs_const));
 
-        const index_t group_id  = FindGroupId(gemm_desc_ptr, block_id, group_count);
-        const auto& kargs       = gemm_desc_ptr[group_id];
+        const index_t group_id = FindGroupId(gemm_desc_ptr, block_id, group_count);
+        const auto& kargs      = gemm_desc_ptr[group_id];
+
         const auto grid_size_2d = TilePartitioner::GridSize(kargs.group_karg.M, kargs.group_karg.N);
         const auto block_idx_2d = OffsetTile1DPartitioner::GetOffsetedTileIndex(
             0,
@@ -510,7 +545,7 @@ struct GroupedGemmKernel
                                    const index_t group_count) const
     {
         const index_t grid_size  = ck_tile::get_grid_size();
-        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg*>(
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg<NumDTensor_>*>(
             cast_pointer_to_generic_address_space(gemm_descs_const));
         index_t block_id      = ck_tile::get_block_1d_id(); // initial block_id
         index_t cum_grid_size = 0;
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index a05e7b2ad0..ad85b5392d 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -9,15 +9,6 @@
 
 namespace ck_tile {
 
-enum StreamKReductionStrategy : uint32_t
-{
-    /// @brief Workgroups atomically add their results to the C tensor
-    Atomic = 0u,
-    /// @brief For a given tile in the C tensor, one workgroup accumulates results of other
-    /// contributing workgroups
-    Reduction = 1u
-};
-
 /// @brief The Stream K GEMM kernel host arguments.
 ///
 /// @par Overview
@@ -37,7 +28,7 @@ struct StreamKHostArgs : public ck_tile::UniversalGemmHostArgs<>
                                           index_t stride_B_,
                                           index_t stride_C_,
                                           StreamKReductionStrategy reduction_strategy_,
-                                          index_t num_sk_blocks_ = -1)
+                                          uint32_t num_sk_blocks_ = 0xffffffff)
         : UniversalGemmHostArgs<>({a_ptr_},
                                   {b_ptr_},
                                   {/*ds_ptr*/},
@@ -56,7 +47,7 @@ struct StreamKHostArgs : public ck_tile::UniversalGemmHostArgs<>
     }
 
     ck_tile::StreamKReductionStrategy reduction_strategy;
-    index_t num_sk_blocks;
+    uint32_t num_sk_blocks;
 };
 
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
@@ -103,7 +94,7 @@ struct StreamKKernel
         /// @brief  The strategy used by work groups to compute final results in C tensor.
         StreamKReductionStrategy reduction_strategy;
         /// @brief  The number of stream k blocks.
-        index_t num_sk_blocks;
+        uint32_t num_sk_blocks;
         /// @brief  A pointer to a buffer in device memory for accumulating partial via reduction
         /// strategy.
         void* workspace_ptr;
@@ -150,36 +141,102 @@ struct StreamKKernel
         return UniversalGemmKernel::BlockSize();
     }
 
-    CK_TILE_HOST static StreamKKernelArgs MakeKernelArgs(const StreamKHostArgs& host_args)
+    /// @brief Constructs kernel arguments for the Stream-K kernel.
+    /// @param host_args Stream-K host arguments.
+    /// @param num_cu Number of compute units (CUs). The default is the number of CUs on the device.
+    /// The caller may select their own to assist with test reproducibility, etc.
+    /// @param occupancy The maximum number of active blocks per CU for this kernel. The caller may
+    /// select their own to assist with test reproducibility, etc.
+    /// @return The kernel arguments for Stream-K.
+    CK_TILE_HOST static StreamKKernelArgs MakeKernelArgs(const StreamKHostArgs& host_args,
+                                                         int num_cu    = NumCU(),
+                                                         int occupancy = Occupancy())
     {
-        index_t occupancy = static_cast<index_t>(Occupancy());
-        index_t num_cu    = static_cast<index_t>(NumCU());
-
-        return StreamKKernelArgs{
-            {host_args.as_ptr,
-             host_args.bs_ptr,
-             host_args.ds_ptr,
-             host_args.e_ptr,
-             host_args.M,
-             host_args.N,
-             host_args.K,
-             host_args.stride_As,
-             host_args.stride_Bs,
-             host_args.stride_Ds,
-             host_args.stride_E,
-             host_args.k_batch},
-            host_args.reduction_strategy,
-            host_args.num_sk_blocks,
-            // The workspace pointer is set to nullptr because we must first
-            // instantiate the TilePartitioner to get the necessary size
-            /*workspace_ptr =*/nullptr,
-            TilePartitioner{
-                host_args.M, host_args.N, host_args.K, num_cu, occupancy, host_args.num_sk_blocks}};
+        return StreamKKernelArgs{{host_args.as_ptr,
+                                  host_args.bs_ptr,
+                                  host_args.ds_ptr,
+                                  host_args.e_ptr,
+                                  host_args.M,
+                                  host_args.N,
+                                  host_args.K,
+                                  host_args.stride_As,
+                                  host_args.stride_Bs,
+                                  host_args.stride_Ds,
+                                  host_args.stride_E,
+                                  host_args.k_batch},
+                                 host_args.reduction_strategy,
+                                 host_args.num_sk_blocks,
+                                 // The workspace pointer is set to nullptr because we must first
+                                 // instantiate the TilePartitioner to get the necessary size
+                                 /*workspace_ptr =*/nullptr,
+                                 TilePartitioner{static_cast<uint32_t>(host_args.M),
+                                                 static_cast<uint32_t>(host_args.N),
+                                                 static_cast<uint32_t>(host_args.K),
+                                                 static_cast<uint32_t>(num_cu),
+                                                 static_cast<uint32_t>(occupancy),
+                                                 host_args.num_sk_blocks}};
     }
 
-    CK_TILE_HOST static bool
-    IsSupportedArgument(const typename UniversalGemmKernel::KernelArgs& kargs)
+    template <bool UseDefaultScheduler = true>
+    CK_TILE_DEVICE static void
+    RunGemm(const std::array<const ADataType*, UniversalGemmKernel::NumATensor>& as_ptr,
+            const std::array<const BDataType*, UniversalGemmKernel::NumBTensor>& bs_ptr,
+            const std::array<const void*, UniversalGemmKernel::NumDTensor>& ds_ptr,
+            CDataType* c_ptr,
+            void* smem_ptr_0,
+            const typename UniversalGemmKernel::KernelArgs& kargs,
+            const index_t num_loop,
+            const index_t block_idx_m,
+            const index_t block_idx_n,
+            const index_t k_size)
     {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            UniversalGemmKernel::template MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                as_ptr, bs_ptr, ds_ptr, c_ptr, kargs, k_size);
+
+        const auto& gemm_pad_views = UniversalGemmKernel::MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows =
+            UniversalGemmKernel::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& as_block_window = gemm_tile_windows.at(UniversalGemmKernel::I0);
+        const auto& bs_block_window = gemm_tile_windows.at(UniversalGemmKernel::I1);
+        const auto& ds_block_window = gemm_tile_windows.at(UniversalGemmKernel::I2);
+
+        // Since num_loop can vary per WG and per iteration of the Stream-K while loop, we compute
+        // has_hot_loop and tail_num here. This is a similar pattern used by grouped GEMM. In this
+        // case, we call the GemmPipeline's operator() function that takes both has_hot_loop and
+        // tail_num.
+        const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
+        const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto& c_block_tile = GemmPipeline{}(as_block_window[UniversalGemmKernel::I0],
+                                                  bs_block_window[UniversalGemmKernel::I0],
+                                                  num_loop,
+                                                  has_hot_loop,
+                                                  tail_num,
+                                                  smem_ptr_0);
+
+        if(UseDefaultScheduler || (get_warp_id() == 0))
+        {
+            // Run Epilogue Pipeline
+            auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3);
+
+            EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
+        }
+    }
+
+    CK_TILE_HOST static bool IsSupportedArgument(const StreamKKernelArgs& kargs)
+    {
+        if(kargs.reduction_strategy == StreamKReductionStrategy::Reduction)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("CK Tile Stream-K only supports the atomic reduction strategy.");
+            }
+            return false;
+        }
         return UniversalGemmKernel::IsSupportedArgument(kargs);
     }
 
@@ -205,9 +262,81 @@ struct StreamKKernel
         kargs.workspace_ptr = workspace_ptr;
     }
 
-    // Temporary placeholder to support the Occupancy() static function.
-    // Since the Occupancy function uses kentry, this class must have an operator() function
-    CK_TILE_DEVICE void operator()(StreamKKernelArgs /*kargs*/) const {}
+    /// @brief Entry point for the Stream-K Kernel, performing the main Stream-K loop.
+    CK_TILE_DEVICE void operator()(StreamKKernelArgs kargs) const
+    {
+        // Allocate LDS
+        __shared__ char smem_ptr_0[UniversalGemmKernel::GetSmemSize()];
+
+        uint32_t block_idx = ck_tile::get_block_1d_id();
+
+        bool is_padding_block =
+            amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
+                                     block_idx < kargs.tile_partitioner.dp_start_block_idx);
+
+        // Padding blocks make it such that the DP blocks are aligned with the number of CUs; they
+        // should not partake in the GEMM
+        if(is_padding_block)
+            return;
+
+        // Determine the K offset of the first and final macro tile in the A and B tensors along the
+        // K dimension.
+        uint32_t iter_start, iter_end;
+        kargs.tile_partitioner.GetBlockItr(block_idx, iter_start, iter_end);
+
+        // Main Stream-K loop
+        while(true)
+        {
+            // Determine the number of macro tiles in A and B this WG is resposible for in the
+            // current C macro tile.
+            uint32_t current_iter_length = amd_wave_read_first_lane(
+                kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end));
+
+            // Determine the 1D tile_idx and the iter_offset for this WG.
+            // The tile_idx is the 1D macro tile index in the C tensor.
+            // The iter_offset is the starting macro tile index in the K dimension for the WG in the
+            // current iteration of the while loop.
+            uint32_t tile_idx, iter_offset;
+            kargs.tile_partitioner.GetTileIdxWithOffset(iter_start, tile_idx, iter_offset);
+
+            // Get the 2D tile index in the C tensor for this WG using the 1D index (i.e. tile_idx)
+            auto spatial_idx = kargs.tile_partitioner.GetOutputTileIndex(tile_idx);
+
+            // Get the offsets in A, B, C tensors.
+            index_t i_m = static_cast<index_t>(spatial_idx[UniversalGemmKernel::I0] *
+                                               TilePartitioner::MPerBlock);
+            index_t i_n = static_cast<index_t>(spatial_idx[UniversalGemmKernel::I1] *
+                                               TilePartitioner::NPerBlock);
+            index_t i_k = static_cast<index_t>(iter_offset) * TilePartitioner::KPerBlock;
+
+            // Determine the total size along the K dimension the WG is using in this iteration
+            // (used to construct tensor views).
+            index_t k_size = static_cast<index_t>(current_iter_length * TilePartitioner::KPerBlock);
+
+            // Update pointer offsets for A, B, and C.
+            const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + i_k;
+            const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + i_k;
+            CDataType* c_ptr       = static_cast<CDataType*>(kargs.e_ptr);
+
+            // Run the GEMM pipeline and Epilogue.
+            RunGemm({a_ptr},
+                    {b_ptr},
+                    {/*ds_ptr*/},
+                    c_ptr,
+                    smem_ptr_0,
+                    kargs,
+                    current_iter_length,
+                    i_m,
+                    i_n,
+                    k_size);
+
+            // Prepare for next Stream-K loop iteration.
+            iter_start += current_iter_length;
+            if(iter_end <= iter_start)
+                break;
+            block_sync_lds();
+        }
+    }
 
     private:
     CK_TILE_HOST static int NumCU()
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
index 8117d65758..51ad4e3dd1 100644
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -157,23 +157,23 @@ struct UniversalGemmKernel
     using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
 
     static constexpr bool ADataTypeIsTuple =
-        is_detected<is_tuple, typename GemmPipeline::ADataType>::value;
+        is_detected<is_tuple, typename GemmPipeline::AsDataType>::value;
     static constexpr bool BDataTypeIsTuple =
-        is_detected<is_tuple, typename GemmPipeline::BDataType>::value;
+        is_detected<is_tuple, typename GemmPipeline::BsDataType>::value;
     static constexpr bool DDataTypeIsTuple =
         is_detected<is_tuple, typename EpiloguePipeline::DsDataType>::value;
     static constexpr bool ALayoutIsTuple =
-        is_detected<is_tuple, typename GemmPipeline::ALayout>::value;
+        is_detected<is_tuple, typename GemmPipeline::AsLayout>::value;
     static constexpr bool BLayoutIsTuple =
-        is_detected<is_tuple, typename GemmPipeline::BLayout>::value;
+        is_detected<is_tuple, typename GemmPipeline::BsLayout>::value;
     static constexpr bool DLayoutIsTuple =
         is_detected<is_tuple, typename EpiloguePipeline::DsLayout>::value;
 
     using AsLayout = std::conditional_t<ALayoutIsTuple,
-                                        remove_cvref_t<typename GemmPipeline::ALayout>,
+                                        remove_cvref_t<typename GemmPipeline::AsLayout>,
                                         remove_cvref_t<tuple<typename GemmPipeline::ALayout>>>;
     using BsLayout = std::conditional_t<BLayoutIsTuple,
-                                        remove_cvref_t<typename GemmPipeline::BLayout>,
+                                        remove_cvref_t<typename GemmPipeline::BsLayout>,
                                         remove_cvref_t<tuple<typename GemmPipeline::BLayout>>>;
 
     using DsLayout = std::conditional_t<DLayoutIsTuple,
@@ -181,11 +181,11 @@ struct UniversalGemmKernel
                                         remove_cvref_t<tuple<typename EpiloguePipeline::DsLayout>>>;
 
     using AsDataType = std::conditional_t<ADataTypeIsTuple,
-                                          remove_cvref_t<typename GemmPipeline::ADataType>,
+                                          remove_cvref_t<typename GemmPipeline::AsDataType>,
                                           remove_cvref_t<tuple<typename GemmPipeline::ADataType>>>;
 
     using BsDataType = std::conditional_t<BDataTypeIsTuple,
-                                          remove_cvref_t<typename GemmPipeline::BDataType>,
+                                          remove_cvref_t<typename GemmPipeline::BsDataType>,
                                           remove_cvref_t<tuple<typename GemmPipeline::BDataType>>>;
 
     using DsDataType =
@@ -193,9 +193,12 @@ struct UniversalGemmKernel
                            remove_cvref_t<typename EpiloguePipeline::DsDataType>,
                            remove_cvref_t<tuple<typename EpiloguePipeline::DsDataType>>>;
 
-    using ELayout   = remove_cvref_t<typename GemmPipeline::CLayout>;
+    using CLayout   = remove_cvref_t<typename GemmPipeline::CLayout>;
     using EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
+    using AElementWise = remove_cvref_t<typename GemmPipeline::AElementWise>;
+    using BElementWise = remove_cvref_t<typename GemmPipeline::BElementWise>;
+
     static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
     // Get the persistent kernel if the pipeline has it available
@@ -323,19 +326,19 @@ struct UniversalGemmKernel
         __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
         {
             constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
+            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
+            const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
 
             static_for<0, NumATensor, 1>{}([&](auto index) {
                 using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
                 if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, AiLayout>)
                 {
-                    as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
+                    as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
                 }
                 else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, AiLayout>)
                 {
                     as_k_split_offset[index] =
-                        __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]);
+                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);
                 }
             });
 
@@ -344,21 +347,21 @@ struct UniversalGemmKernel
                 if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BiLayout>)
                 {
                     bs_k_split_offset[index] =
-                        __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]);
+                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);
                 }
                 else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BiLayout>)
                 {
-                    bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
+                    bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
                 }
             });
 
             if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
             {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
+                splitted_k = amd_wave_read_first_lane(KRead);
             }
             else
             {
-                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
+                splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
             }
         }
 
@@ -483,7 +486,7 @@ struct UniversalGemmKernel
         bool DTesnorIsValid = {true};
         static_for<0, NumDTensor, 1>{}([&](auto index) {
             using DiLayout = remove_cvref_t<std::tuple_element_t<index.value, DsLayout>>;
-            if(std::is_same_v<DiLayout, ELayout> == false)
+            if(std::is_same_v<DiLayout, CLayout> == false)
             {
                 DTesnorIsValid = false;
             }
@@ -529,7 +532,7 @@ struct UniversalGemmKernel
             }
         });
 
-        if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
         {
             if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
             {
@@ -579,7 +582,7 @@ struct UniversalGemmKernel
                         const std::array<const void*, NumDTensor>& ds_ptr,
                         EDataType* e_ptr,
                         const KernelArgs& kargs,
-                        const SplitKBatchOffset& splitk_batch_offset)
+                        const index_t k_size)
     {
         static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
 
@@ -591,7 +594,7 @@ struct UniversalGemmKernel
                 {
                     return make_naive_tensor_view<address_space_enum::global>(
                         static_cast<const AiDataType*>(as_ptr[i]),
-                        make_tuple(kargs.M, splitk_batch_offset.splitted_k),
+                        make_tuple(kargs.M, k_size),
                         make_tuple(kargs.stride_As[i], 1),
                         number<GemmPipeline::GetVectorSizeA()>{},
                         number<1>{});
@@ -600,7 +603,7 @@ struct UniversalGemmKernel
                 {
                     return make_naive_tensor_view<address_space_enum::global>(
                         static_cast<const AiDataType*>(as_ptr[i]),
-                        make_tuple(splitk_batch_offset.splitted_k, kargs.M),
+                        make_tuple(k_size, kargs.M),
                         make_tuple(kargs.stride_As[i], 1),
                         number<GemmPipeline::GetVectorSizeA()>{},
                         number<1>{});
@@ -617,7 +620,7 @@ struct UniversalGemmKernel
                     if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
                     {
                         constexpr index_t K1 = GemmPipeline::GetSmemPackB();
-                        const index_t K0     = splitk_batch_offset.splitted_k / K1;
+                        const index_t K0     = k_size / K1;
                         constexpr index_t VectorSizeB =
                             std::min(K1, GemmPipeline::GetVectorSizeB());
                         const auto b_k0_n_k1_desc =
@@ -638,7 +641,7 @@ struct UniversalGemmKernel
                     {
                         return make_naive_tensor_view<address_space_enum::global>(
                             bs_ptr[i],
-                            make_tuple(splitk_batch_offset.splitted_k, kargs.N),
+                            make_tuple(k_size, kargs.N),
                             make_tuple(kargs.stride_Bs[i], 1),
                             number<GemmPipeline::GetVectorSizeB()>{},
                             number<1>{});
@@ -649,7 +652,7 @@ struct UniversalGemmKernel
                     if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
                     {
                         constexpr index_t K1 = GemmPipeline::GetSmemPackB();
-                        const index_t K0     = splitk_batch_offset.splitted_k / K1;
+                        const index_t K0     = k_size / K1;
                         constexpr index_t VectorSizeB =
                             std::min(K1, GemmPipeline::GetVectorSizeB());
                         const auto b_k0_n_k1_desc =
@@ -672,7 +675,7 @@ struct UniversalGemmKernel
                         {
                             index_t kFlatK =
                                 GemmPipeline::BlockGemmShape::flatKPerWarp *
-                                (splitk_batch_offset.splitted_k /
+                                (k_size /
                                  TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}));
                             index_t kFlatN = kargs.N * kargs.K / kFlatK;
 
@@ -687,7 +690,7 @@ struct UniversalGemmKernel
                         {
                             return make_naive_tensor_view<address_space_enum::global>(
                                 bs_ptr[i],
-                                make_tuple(kargs.N, splitk_batch_offset.splitted_k),
+                                make_tuple(kargs.N, k_size),
                                 make_tuple(kargs.stride_Bs[i], 1),
                                 number<GemmPipeline::GetVectorSizeB()>{},
                                 number<1>{});
@@ -724,7 +727,7 @@ struct UniversalGemmKernel
 
         // TODO: enable vector write for C in ColMajor
         const auto& e_tensor_view = [&]() {
-            if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
                     e_ptr,
@@ -818,7 +821,7 @@ struct UniversalGemmKernel
         // TODO vector write in for C in ColMajor
         const auto& e_pad_view = [&]() {
             const auto& e_tensor_view = views.at(I3);
-            if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
             {
                 return pad_tensor_view(e_tensor_view,
                                        make_tuple(number<TilePartitioner::MPerBlock>{},
@@ -962,21 +965,21 @@ struct UniversalGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
-                as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset);
+                as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset.splitted_k);
 
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop =
+            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& as_block_window = gemm_tile_windows.at(I0);
         const auto& bs_block_window = gemm_tile_windows.at(I1);
         const auto& ds_block_window = gemm_tile_windows.at(I2);
 
-        const auto& c_block_tile =
-            GemmPipeline{}(as_block_window[I0], bs_block_window[I0], num_loop, smem_ptr_0);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            as_block_window, AElementWise{}, bs_block_window, BElementWise{}, num_loop, smem_ptr_0);
 
         if(UseDefaultScheduler || (get_warp_id() == 0))
         {
@@ -1018,21 +1021,26 @@ struct UniversalGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
-                as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset);
+                as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset.splitted_k);
 
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop =
+            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& as_block_window = gemm_tile_windows.at(I0);
         const auto& bs_block_window = gemm_tile_windows.at(I1);
         const auto& ds_block_window = gemm_tile_windows.at(I2);
 
-        const auto& c_block_tile = GemmPipeline{}(
-            as_block_window[I0], bs_block_window[I0], num_loop, smem_ptr_0, smem_ptr_1);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(as_block_window,
+                                                                      AElementWise{},
+                                                                      bs_block_window,
+                                                                      BElementWise{},
+                                                                      num_loop,
+                                                                      smem_ptr_0,
+                                                                      smem_ptr_1);
 
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(I3);
@@ -1044,10 +1052,10 @@ struct UniversalGemmKernel
     template <bool U = !PersistentKernel, typename = std::enable_if_t<U>>
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
     {
-        const auto blockId  = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockId  = amd_wave_read_first_lane(blockIdx.x);
         const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
-        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
         const SplitKBatchOffset splitk_batch_offset(kargs);
 
@@ -1118,22 +1126,22 @@ struct UniversalGemmKernel
     template <bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
     {
-        const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size());
+        const auto grid_size = amd_wave_read_first_lane(get_grid_size());
         const auto num_tiles =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N));
-        const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch);
-        auto block_id       = __builtin_amdgcn_readfirstlane(get_block_id());
+            amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
+        const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch);
+        auto block_id       = amd_wave_read_first_lane(get_block_id());
 
         while(block_id < num_work)
         {
             // Get the tile index for this block
-            const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles);
+            const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles);
             const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);
-            const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-            const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+            const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+            const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
             // Get the SplitK offset for this block
-            const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles);
+            const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles);
             const SplitKBatchOffset splitk_batch_offset(kargs, k_batch);
 
             std::array<const ADataType*, NumATensor> as_ptr;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
index ad1624a99b..98fdd62886 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -11,12 +11,17 @@ namespace ck_tile {
 template <typename Problem, typename Policy>
 struct GemmPipelineAgBgCrImplBase
 {
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using ALayout        = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout        = remove_cvref_t<typename Problem::BLayout>;
+    using AsDataType     = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType     = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using AsLayout       = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout       = remove_cvref_t<typename Problem::BsLayoutTuple>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    using ADataType = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataType>>;
+    using ALayout   = remove_cvref_t<std::tuple_element_t<number<0>{}, AsLayout>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataType>>;
+    using BLayout   = remove_cvref_t<std::tuple_element_t<number<0>{}, BsLayout>>;
+
     static constexpr index_t MPerBlock = BlockGemmShape::kM;
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
@@ -53,8 +58,15 @@ struct GemmPipelineAgBgCrImplBase
                                      const SrcBlockTile& src_block_tile,
                                      const ElementFunction& element_func) const
     {
-        const auto block_tile = tile_elementwise_in(element_func, src_block_tile);
-        store_tile(lds_tile_window, block_tile);
+        const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
+        store_tile(lds_tile_window, block_tile_tmp);
+    }
+
+    template <typename DstTileWindow, typename SrcBlockTile>
+    CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
+                                     const SrcBlockTile& src_block_tile) const
+    {
+        store_tile(lds_tile_window, src_block_tile);
     }
 
     template <typename DstBlockTile, typename SrcTileWindow, bool LoadTranspose = false>
@@ -91,35 +103,156 @@ struct GemmPipelineAgBgCrImplBase
         return make_tuple(std::move(a_lds_block), std::move(b_lds_block));
     }
 
-    template <typename ADramBlockWindow, typename ALdsTensorView, typename ALdsLoadTileDistr>
-    CK_TILE_DEVICE constexpr auto GetAWindows(const ADramBlockWindow& a_dram_block_window,
-                                              const ALdsTensorView& a_lds_block_view,
-                                              const ALdsLoadTileDistr&,
-                                              const array<index_t, 2>& offset = {0, 0}) const
+    template <typename DramBlockWindowTmp,
+              typename LdsLoadTileDistr,
+              typename std::enable_if_t<is_detected<is_tuple, DramBlockWindowTmp>::value, bool>* =
+                  nullptr>
+    CK_TILE_DEVICE constexpr auto CopyADramWindow(const DramBlockWindowTmp& dram_block_window_tmp,
+                                                  const LdsLoadTileDistr&,
+                                                  const array<index_t, 2>& offset = {0, 0}) const
+    {
+
+        constexpr bool is_col_major = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
+
+        using YPerTile = std::conditional_t<is_col_major, number<KPerBlock>, number<MPerBlock>>;
+        using XPerTile = std::conditional_t<is_col_major, number<MPerBlock>, number<KPerBlock>>;
+        // A DRAM tile window for load
+        auto a_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                if constexpr(Problem::SkipALds == false)
+                {
+                    return make_tile_window(
+                        dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(YPerTile{}, XPerTile{}),
+                        dram_block_window_tmp[number<idx>{}].get_window_origin() + offset,
+                        Policy::template MakeADramTileDistribution<Problem>());
+                }
+                else
+                {
+                    return make_tile_window(
+                        dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(YPerTile{}, XPerTile{}),
+                        dram_block_window_tmp[number<idx>{}].get_window_origin() + offset,
+                        LdsLoadTileDistr{});
+                }
+            },
+            number<DramBlockWindowTmp::size()>{});
+        return std::move(a_copy_dram_window);
+    }
+
+    template <typename DramBlockWindowTmp,
+              typename LdsLoadTileDistr,
+              typename std::enable_if_t<!is_detected<is_tuple, DramBlockWindowTmp>::value, bool>* =
+                  nullptr>
+    CK_TILE_DEVICE constexpr auto CopyADramWindow(const DramBlockWindowTmp& dram_block_window_tmp,
+                                                  const LdsLoadTileDistr&,
+                                                  const array<index_t, 2>& offset = {0, 0}) const
     {
         constexpr bool is_col_major = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
 
         using YPerTile = std::conditional_t<is_col_major, number<KPerBlock>, number<MPerBlock>>;
         using XPerTile = std::conditional_t<is_col_major, number<MPerBlock>, number<KPerBlock>>;
-
         // A DRAM tile window for load
         decltype(auto) a_copy_dram_window = [&] {
             if constexpr(Problem::SkipALds == false)
             {
-                return make_tile_window(a_dram_block_window.get_bottom_tensor_view(),
+                return make_tile_window(dram_block_window_tmp.get_bottom_tensor_view(),
                                         make_tuple(YPerTile{}, XPerTile{}),
-                                        a_dram_block_window.get_window_origin() + offset,
+                                        dram_block_window_tmp.get_window_origin() + offset,
                                         Policy::template MakeADramTileDistribution<Problem>());
             }
             else
             {
-                return make_tile_window(a_dram_block_window.get_bottom_tensor_view(),
+                return make_tile_window(dram_block_window_tmp.get_bottom_tensor_view(),
                                         make_tuple(YPerTile{}, XPerTile{}),
-                                        a_dram_block_window.get_window_origin() + offset,
-                                        ALdsLoadTileDistr{});
+                                        dram_block_window_tmp.get_window_origin() + offset,
+                                        LdsLoadTileDistr{});
             }
         }();
 
+        return std::move(a_copy_dram_window);
+    }
+
+    template <typename DramBlockWindowTmp,
+              typename LdsLoadTileDistr,
+              typename std::enable_if_t<is_detected<is_tuple, DramBlockWindowTmp>::value, bool>* =
+                  nullptr>
+    CK_TILE_DEVICE constexpr auto CopyBDramWindow(const DramBlockWindowTmp& dram_block_window_tmp,
+                                                  const LdsLoadTileDistr&,
+                                                  const array<index_t, 2>& offset = {0, 0}) const
+    {
+
+        constexpr bool is_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
+
+        using YPerTile = std::conditional_t<is_row_major, number<KPerBlock>, number<NPerBlock>>;
+        using XPerTile = std::conditional_t<is_row_major, number<NPerBlock>, number<KPerBlock>>;
+        // A DRAM tile window for load
+        auto a_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                if constexpr(Problem::SkipBLds == false)
+                {
+                    return make_tile_window(
+                        dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(YPerTile{}, XPerTile{}),
+                        dram_block_window_tmp[number<idx>{}].get_window_origin() + offset,
+                        Policy::template MakeBDramTileDistribution<Problem>());
+                }
+                else
+                {
+                    return make_tile_window(
+                        dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(YPerTile{}, XPerTile{}),
+                        dram_block_window_tmp[number<idx>{}].get_window_origin() + offset,
+                        LdsLoadTileDistr{});
+                }
+            },
+            number<DramBlockWindowTmp::size()>{});
+        return std::move(a_copy_dram_window);
+    }
+
+    template <typename DramBlockWindowTmp,
+              typename LdsLoadTileDistr,
+              typename std::enable_if_t<!is_detected<is_tuple, DramBlockWindowTmp>::value, bool>* =
+                  nullptr>
+    CK_TILE_DEVICE constexpr auto CopyBDramWindow(const DramBlockWindowTmp& dram_block_window_tmp,
+                                                  const LdsLoadTileDistr&,
+                                                  const array<index_t, 2>& offset = {0, 0}) const
+    {
+        constexpr bool is_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
+
+        using YPerTile = std::conditional_t<is_row_major, number<KPerBlock>, number<NPerBlock>>;
+        using XPerTile = std::conditional_t<is_row_major, number<NPerBlock>, number<KPerBlock>>;
+        // A DRAM tile window for load
+        decltype(auto) b_copy_dram_window = [&]() {
+            if constexpr(Problem::SkipBLds == false)
+            {
+                make_tile_window(dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(YPerTile{}, XPerTile{}),
+                                 dram_block_window_tmp.get_window_origin() + offset,
+                                 Policy::template MakeBDramTileDistribution<Problem>());
+            }
+            else
+            {
+                make_tile_window(dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(YPerTile{}, XPerTile{}),
+                                 dram_block_window_tmp.get_window_origin() + offset,
+                                 LdsLoadTileDistr{});
+            }
+        }();
+
+        return std::move(b_copy_dram_window);
+    }
+
+    template <typename ADramBlockWindowTmp, typename ALdsTensorView, typename ALdsLoadTileDistr>
+    CK_TILE_DEVICE constexpr auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                              const ALdsTensorView& a_lds_block_view,
+                                              const ALdsLoadTileDistr&,
+                                              const array<index_t, 2>& offset = {0, 0}) const
+    {
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            CopyADramWindow(a_dram_block_window_tmp, ALdsLoadTileDistr{}, offset);
+
         // A LDS tile window for store
         auto a_lds_shape = []() {
             if constexpr(is_a_load_tr)
@@ -147,32 +280,14 @@ struct GemmPipelineAgBgCrImplBase
     }
 
     template <typename BDramBlockWindow, typename BLdsTensorView, typename BLdsLoadTileDistr>
-    CK_TILE_DEVICE constexpr auto GetBWindows(const BDramBlockWindow& b_dram_block_window,
+    CK_TILE_DEVICE constexpr auto GetBWindows(const BDramBlockWindow& b_dram_block_window_tmp,
                                               const BLdsTensorView& b_lds_block_view,
                                               const BLdsLoadTileDistr&,
                                               const array<index_t, 2>& offset = {0, 0}) const
     {
-        constexpr bool is_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
-
-        using YPerTile = std::conditional_t<is_row_major, number<KPerBlock>, number<NPerBlock>>;
-        using XPerTile = std::conditional_t<is_row_major, number<NPerBlock>, number<KPerBlock>>;
-
-        decltype(auto) b_copy_dram_window = [&] {
-            if constexpr(Problem::SkipBLds == false)
-            {
-                return make_tile_window(b_dram_block_window.get_bottom_tensor_view(),
-                                        make_tuple(YPerTile{}, XPerTile{}),
-                                        b_dram_block_window.get_window_origin() + offset,
-                                        Policy::template MakeBDramTileDistribution<Problem>());
-            }
-            else
-            {
-                return make_tile_window(b_dram_block_window.get_bottom_tensor_view(),
-                                        make_tuple(YPerTile{}, XPerTile{}),
-                                        b_dram_block_window.get_window_origin() + offset,
-                                        BLdsLoadTileDistr{});
-            }
-        }();
+        // A DRAM tile window for load
+        auto b_copy_dram_window =
+            CopyBDramWindow(b_dram_block_window_tmp, BLdsLoadTileDistr{}, offset);
 
         // TODO: Do we really need those two tile windows???
         // They're exactly same...
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index 5f4ee8987e..2b0b2e8488 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -107,14 +107,23 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
     using Base             = BaseGemmPipelineAgBgCrCompV3<Problem>;
     using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
 
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
     using I0        = number<0>;
@@ -386,17 +395,25 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
-                  typename ADramBlockWindowTmp,
-                  typename BDramBlockWindowTmp,
+                  typename AsDramBlockWindowTmp,
+                  typename BsDramBlockWindowTmp,
                   typename AElementFunction,
-                  typename BElementFunction>
-        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                  typename BElementFunction,
+                  typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                                is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                            bool>* = nullptr>
+        CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                        const AElementFunction& a_element_func,
-                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
                                        void* p_smem) const
         {
+            using ADramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+            using BDramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
             static_assert(
                 std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                     std::is_same_v<BDataType,
@@ -449,17 +466,6 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             auto block_gemm   = BlockGemm();
             auto c_block_tile = block_gemm.MakeCBlockTile();
 
-            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
-            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
-
-            using ABlockTile =
-                decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
-            using BBlockTile =
-                decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
-
-            ABlockTile a_block_tile;
-            BBlockTile b_block_tile;
-
             using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
             using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
 
@@ -470,41 +476,58 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
 
             // -----------------------------------------------------------------------------------------
             // Gemm pipeline start
-
-            // prefetch
-            // global read 0
-            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step);
-
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
+            // Load tile — during value loading, an elementwise function is executed for each A0,
+            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            auto elementwise_As_res =
+                load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+            // Load tile — during value loading, an elementwise function is executed for each B0,
+            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            auto elementwise_Bs_res =
+                load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
+
             // LDS write 0
             if constexpr(is_a_col_major && !is_a_load_tr_v())
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                     Policy::template MakeShuffledARegTileDistribution<Problem>());
-                transpose_tile2d(a_shuffle_tmp, a_block_tile);
-                Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+                Base::LocalPrefill(a_copy_lds_window, elementwise_As_res);
             }
             if constexpr(is_b_row_major && !is_b_load_tr_v())
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                     Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                transpose_tile2d(b_shuffle_tmp, b_block_tile);
-                Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+                Base::LocalPrefill(b_copy_lds_window, elementwise_Bs_res);
             }
 
-            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step);
+            // global read 1
+
+            elementwise_As_res = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+            elementwise_Bs_res = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
             block_sync_lds();
             block_gemm.LocalPrefetch(
@@ -524,27 +547,32 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
                     {
                         auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                             Policy::template MakeShuffledARegTileDistribution<Problem>());
-                        transpose_tile2d(a_shuffle_tmp, a_block_tile);
-                        Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                        transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                        Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                     }
                     else
                     {
-                        Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+                        Base::LocalPrefill(a_copy_lds_window, elementwise_As_res);
                     }
                     if constexpr(is_b_row_major && !is_b_load_tr_v())
                     {
                         auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                             Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                        transpose_tile2d(b_shuffle_tmp, b_block_tile);
-                        Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                        transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                        Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                     }
                     else
                     {
-                        Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+                        Base::LocalPrefill(b_copy_lds_window, elementwise_Bs_res);
                     }
 
-                    Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
-                    Base::GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step);
+                    elementwise_As_res =
+                        load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+                    move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                    elementwise_Bs_res =
+                        load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+                    move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
                     block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
 
@@ -570,27 +598,27 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
                 block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
                 block_sync_lds();
 
-                if constexpr(is_a_col_major)
+                if constexpr(is_a_col_major && !is_a_load_tr_v())
                 {
                     auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                         Policy::template MakeShuffledARegTileDistribution<Problem>());
-                    transpose_tile2d(a_shuffle_tmp, a_block_tile);
-                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                    transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, elementwise_As_res);
                 }
-                if constexpr(is_b_row_major)
+                if constexpr(is_b_row_major && !is_b_load_tr_v())
                 {
                     auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                         Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                    transpose_tile2d(b_shuffle_tmp, b_block_tile);
-                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                    transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, elementwise_Bs_res);
                 }
                 block_sync_lds();
                 block_gemm.LocalPrefetch(
@@ -602,13 +630,16 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         }
     };
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                    const AElementFunction& a_element_func,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const BElementFunction& b_element_func,
                                    index_t num_loop,
                                    void* p_smem) const
@@ -628,9 +659,13 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
      * @note This is used by the persistent gemm kernel variants that don't determine
      *       hot loop and tail number on the host side, e.g. grouped gemm kernel.
      */
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    bool has_hot_loop,
                                    TailNumber tail_number,
@@ -639,7 +674,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         const auto RunPipeline = [&](auto hot_loop_, auto tail_num_) {
             constexpr bool hot_loop    = hot_loop_.value;
             constexpr auto tail_num    = tail_num_.value;
-            constexpr auto PassThrough = [](const auto& x) { return x; };
+            constexpr auto PassThrough = [](auto& e, const auto& x) { e = x; };
             return PipelineImpl<Scheduler>{}.template operator()<hot_loop, tail_num>(
                 a_dram_block_window_tmp,
                 PassThrough,
@@ -658,20 +693,97 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
      * @note This is used by the kernel variants that are able to determine
      *       hot loop and tail number on the host side, e.g. non-persistent gemm kernel.
      */
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    void* p_smem) const
     {
         return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
             a_dram_block_window_tmp,
-            [](const ADataType& a) { return a; },
+            [](auto& e, const ADataType& a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType& b) { return b; },
+            [](auto& e, const BDataType& b) { e = b; },
             num_loop,
             p_smem);
     }
+
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          a_element_func,
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          b_element_func,
+                          num_loop,
+                          p_smem);
+    }
+
+    /**
+     * @brief Quant operator(), single input: This function runs the pipeline by wrapping it with
+     * the tail handler.
+     *
+     * @note This is used by the persistent gemm kernel variants that don't determine
+     *       hot loop and tail number on the host side, e.g. grouped gemm kernel.
+     */
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   bool has_hot_loop,
+                                   TailNumber tail_number,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          has_hot_loop,
+                          tail_number,
+                          p_smem);
+    }
+
+    /**
+     * @brief Quant operator(), single input: This function runs the pipeline using compile-time
+     * known hot loop and tail number.
+     * @param num_loop The number of loop iterations. This is determined at runtime due to e.g.
+     * SplitK.
+     * @note This is used by the kernel variants that are able to determine
+     *       hot loop and tail number on the host side, e.g. non-persistent gemm kernel.
+     */
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index c835809b5d..d0466bc8b1 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -97,11 +97,24 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     using Base             = BaseGemmPipelineAgBgCrCompV4<Problem>;
     using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
 
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using AsDataType     = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType     = remove_cvref_t<typename Problem::BsDataTypeTuple>;
     using CDataType      = remove_cvref_t<typename Problem::CDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using AElementWise = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise = remove_cvref_t<typename Problem::BElementWise>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
+
     static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
 
     static constexpr index_t APackedSize =
@@ -109,10 +122,6 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     static constexpr index_t BPackedSize =
         ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
-
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
     using I0        = number<0>;
     using I1        = number<1>;
@@ -244,18 +253,26 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
-                  typename ADramBlockWindowTmp,
-                  typename BDramBlockWindowTmp,
+                  typename AsDramBlockWindowTmp,
+                  typename BsDramBlockWindowTmp,
                   typename AElementFunction,
-                  typename BElementFunction>
-        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                  typename BElementFunction,
+                  typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                                is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                            bool>* = nullptr>
+        CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                        const AElementFunction& a_element_func,
-                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
                                        void* __restrict__ p_smem_0,
                                        void* __restrict__ p_smem_1) const
         {
+            using ADramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+            using BDramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
             static_assert(
                 std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                     std::is_same_v<BDataType,
@@ -279,29 +296,6 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                                  KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]),
                           "B block window has incorrect lengths for defined BLayout!");
 
-            ////////////// global window & register /////////////////
-            // A DRAM tile window for load
-            auto a_copy_dram_window =
-                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                 a_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeADramTileDistribution<Problem>());
-
-            // B DRAM tile window for load
-            auto b_copy_dram_window =
-                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                 b_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeBDramTileDistribution<Problem>());
-
-            // A register tile for global load
-            constexpr auto ABlockTileDistr = a_copy_dram_window.get_tile_distribution();
-            constexpr auto BBlockTileDistr = b_copy_dram_window.get_tile_distribution();
-            using ABlockTile = decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr));
-            using BBlockTile = decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr));
-            ABlockTile a_global_load_tile;
-            BBlockTile b_global_load_tile;
-
             using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
             using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
 
@@ -312,8 +306,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
 
             // global prefetch 0
             // global read 0
-            Base::GlobalPrefetch(a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
+
             ////////////// LDS desc, window & register /////////////////
             auto&& [a_lds_block0, b_lds_block0] = Base::GetABLdsTensorViews(p_smem_0);
             auto&& [a_lds_block1, b_lds_block1] = Base::GetABLdsTensorViews(p_smem_1);
@@ -343,34 +336,75 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
+            // Generating a tuple with tile_windows for values A0, A1, ... AN
+            auto a_tile_windows = generate_tuple(
+                [&](auto idx) {
+                    return make_tile_window(
+                        a_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                        a_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                        Policy::template MakeADramTileDistribution<Problem>());
+                },
+                number<AsLayout::size()>{});
+
+            // Load tile — during value loading, an elementwise function is executed for each A0,
+            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            auto elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func);
+
+            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(a_tile_windows, a_dram_tile_window_step);
+
+            // Generating a tuple with tile_windows for values B0, B1, ... BN
+            auto b_tile_windows = generate_tuple(
+                [&](auto idx) {
+                    return make_tile_window(
+                        b_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                        make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                        b_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                        Policy::template MakeBDramTileDistribution<Problem>());
+                },
+                number<AsLayout::size()>{});
+
+            // Load tile — during value loading, an elementwise function is executed for each B0,
+            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            auto elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func);
+
+            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(b_tile_windows, b_dram_tile_window_step);
+
             // LDS write 0
             if constexpr(is_a_col_major && !is_a_load_tr_v())
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                     Policy::template MakeShuffledARegTileDistribution<Problem>());
-                transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp, a_element_func);
+                transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(a_copy_lds_window0, a_global_load_tile, a_element_func);
+                Base::LocalPrefill(a_copy_lds_window0, elementwise_As_res);
             }
             if constexpr(is_b_row_major && !is_b_load_tr_v())
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                     Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp, b_element_func);
+                transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(b_copy_lds_window0, b_global_load_tile, b_element_func);
+                Base::LocalPrefill(b_copy_lds_window0, elementwise_Bs_res);
             }
 
             // global read 1
-            Base::GlobalPrefetch(a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
 
+            elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func);
+            move_tile_window(a_tile_windows, a_dram_tile_window_step);
+
+            elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func);
+            move_tile_window(b_tile_windows, b_dram_tile_window_step);
             block_sync_lds();
 
             constexpr auto ALdsTileDistr =
@@ -423,32 +457,37 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                     Policy::template MakeShuffledARegTileDistribution<Problem>());
-                transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                Base::LocalPrefill(a_copy_lds_window1, a_shuffle_tmp, a_element_func);
+                transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                Base::LocalPrefill(a_copy_lds_window1, a_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(a_copy_lds_window1, a_global_load_tile, a_element_func);
+                Base::LocalPrefill(a_copy_lds_window1, elementwise_As_res);
             }
             if constexpr(is_b_row_major && !is_b_load_tr_v())
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                     Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                Base::LocalPrefill(b_copy_lds_window1, b_shuffle_tmp, b_element_func);
+                transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                Base::LocalPrefill(b_copy_lds_window1, b_shuffle_tmp);
             }
             else
             {
-                Base::LocalPrefill(b_copy_lds_window1, b_global_load_tile, b_element_func);
+                Base::LocalPrefill(b_copy_lds_window1, elementwise_Bs_res);
             }
 
-            Base::GlobalPrefetch(a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
+            // Load tile — during value loading, an elementwise function is executed for each A0,
+            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func);
+            move_tile_window(a_tile_windows, a_dram_tile_window_step);
+
+            elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func);
+            move_tile_window(b_tile_windows, b_dram_tile_window_step);
 
             if(HasHotLoop)
             {
                 // minus 2 because we have ping-pong double buffer.
-                index_t iCounter = __builtin_amdgcn_readfirstlane(num_loop - 2);
+                index_t iCounter = amd_wave_read_first_lane(num_loop - 2);
                 do
                 {
                     // ping
@@ -461,31 +500,32 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                         {
                             auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                                 Policy::template MakeShuffledARegTileDistribution<Problem>());
-                            transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                            Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp, a_element_func);
+                            transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                            Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp);
                         }
                         else
                         {
-                            Base::LocalPrefill(
-                                a_copy_lds_window0, a_global_load_tile, a_element_func);
+                            Base::LocalPrefill(a_copy_lds_window0, elementwise_As_res);
                         }
                         if constexpr(is_b_row_major && !is_b_load_tr_v())
                         {
                             auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                                 Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                            transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                            Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp, b_element_func);
+                            transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                            Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp);
                         }
                         else
                         {
-                            Base::LocalPrefill(
-                                b_copy_lds_window0, b_global_load_tile, b_element_func);
+                            Base::LocalPrefill(b_copy_lds_window0, elementwise_Bs_res);
                         }
 
-                        Base::GlobalPrefetch(
-                            a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-                        Base::GlobalPrefetch(
-                            b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
+                        elementwise_As_res =
+                            load_tile_with_elementwise(a_tile_windows, a_element_func);
+                        move_tile_window(a_tile_windows, a_dram_tile_window_step);
+
+                        elementwise_Bs_res =
+                            load_tile_with_elementwise(b_tile_windows, b_element_func);
+                        move_tile_window(b_tile_windows, b_dram_tile_window_step);
                         // gemm
                         block_gemm(c_block_tile, a_block_tile0, b_block_tile0);
                         HotLoopScheduler();
@@ -501,32 +541,34 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                         {
                             auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                                 Policy::template MakeShuffledARegTileDistribution<Problem>());
-                            transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                            Base::LocalPrefill(a_copy_lds_window1, a_shuffle_tmp, a_element_func);
+                            transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                            Base::LocalPrefill(a_copy_lds_window1, a_shuffle_tmp);
                         }
                         else
                         {
-                            Base::LocalPrefill(
-                                a_copy_lds_window1, a_global_load_tile, a_element_func);
+                            Base::LocalPrefill(a_copy_lds_window1, elementwise_As_res);
                         }
                         if constexpr(is_b_row_major && !is_b_load_tr_v())
                         {
                             auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                                 Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                            transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                            Base::LocalPrefill(b_copy_lds_window1, b_shuffle_tmp, b_element_func);
+                            transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                            Base::LocalPrefill(b_copy_lds_window1, b_shuffle_tmp);
                         }
                         else
                         {
-                            Base::LocalPrefill(
-                                b_copy_lds_window1, b_global_load_tile, b_element_func);
+                            Base::LocalPrefill(b_copy_lds_window1, elementwise_Bs_res);
                         }
                         block_sync_lds();
 
-                        Base::GlobalPrefetch(
-                            a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-                        Base::GlobalPrefetch(
-                            b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
+                        elementwise_As_res =
+                            load_tile_with_elementwise(a_tile_windows, a_element_func);
+                        move_tile_window(a_tile_windows, a_dram_tile_window_step);
+
+                        elementwise_Bs_res =
+                            load_tile_with_elementwise(b_tile_windows, b_element_func);
+                        move_tile_window(b_tile_windows, b_dram_tile_window_step);
+
                         // gemm
                         block_gemm(c_block_tile, a_block_tile1, b_block_tile1);
                         HotLoopScheduler();
@@ -548,23 +590,23 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                     {
                         auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                             Policy::template MakeShuffledARegTileDistribution<Problem>());
-                        transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                        Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp, a_element_func);
+                        transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                        Base::LocalPrefill(a_copy_lds_window0, a_shuffle_tmp);
                     }
                     else
                     {
-                        Base::LocalPrefill(a_copy_lds_window0, a_global_load_tile, a_element_func);
+                        Base::LocalPrefill(a_copy_lds_window0, elementwise_As_res);
                     }
                     if constexpr(is_b_row_major && !is_b_load_tr_v())
                     {
                         auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                             Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                        transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                        Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp, b_element_func);
+                        transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                        Base::LocalPrefill(b_copy_lds_window0, b_shuffle_tmp);
                     }
                     else
                     {
-                        Base::LocalPrefill(b_copy_lds_window0, b_global_load_tile, b_element_func);
+                        Base::LocalPrefill(b_copy_lds_window0, elementwise_Bs_res);
                     }
                     block_gemm(c_block_tile, a_block_tile0, b_block_tile0);
                 }
@@ -606,13 +648,17 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
         }
     };
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    public:
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                    const AElementFunction& a_element_func,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const BElementFunction& b_element_func,
                                    index_t num_loop,
                                    void* p_smem_0,
@@ -628,27 +674,34 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             p_smem_1);
     }
 
-    public:
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const index_t num_loop,
                                    void* __restrict__ p_smem_0,
                                    void* __restrict__ p_smem_1) const
     {
         return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
             a_dram_block_window_tmp,
-            [](const ADataType& a) { return a; },
+            [](auto& e, const ADataType& a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType& b) { return b; },
+            [](auto& e, const BDataType& b) { e = b; },
             num_loop,
             p_smem_0,
             p_smem_1);
     }
 
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    bool has_hot_loop,
                                    TailNumber tail_number,
@@ -658,7 +711,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
         const auto RunPipeline = [&](auto hot_loop_, auto tail_num_) {
             constexpr bool hot_loop    = hot_loop_.value;
             constexpr auto tail_num    = tail_num_.value;
-            constexpr auto PassThrough = [](const auto& x) { return x; };
+            constexpr auto PassThrough = [](auto& e, const auto& x) { e = x; };
             return PipelineImpl<Scheduler>{}.template operator()<hot_loop, tail_num>(
                 a_dram_block_window_tmp,
                 PassThrough,
@@ -670,5 +723,69 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
         };
         return Base::TailHandler(RunPipeline, has_hot_loop, tail_number);
     }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem_0,
+                                   void* p_smem_1) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          a_element_func,
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          b_element_func,
+                          num_loop,
+                          p_smem_0,
+                          p_smem_1);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const index_t num_loop,
+                                   void* __restrict__ p_smem_0,
+                                   void* __restrict__ p_smem_1) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem_0,
+                          p_smem_1);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   bool has_hot_loop,
+                                   TailNumber tail_number,
+                                   void* __restrict__ p_smem_0,
+                                   void* __restrict__ p_smem_1) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          has_hot_loop,
+                          tail_number,
+                          p_smem_0,
+                          p_smem_1);
+    }
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
index a80ed57be5..3164b41cc7 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
@@ -20,17 +20,18 @@ struct GemmPipelineAgBgCrCompV4DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
-        // using AccDataType     = float;
         using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
 
-        constexpr bool single_load_tr_length =
-            (DS_READ_TR_SIZE() / sizeof(typename Problem::ComputeDataType)) ==
-            (WarpTile::at(I1) * WarpTile::at(I2) / get_warp_size());
+        constexpr index_t vector_size =
+            DS_READ_TR_SIZE() / sizeof(typename Problem::ComputeDataType);
+        constexpr index_t thread_elements = WarpTile::at(I1) * WarpTile::at(I2) / get_warp_size();
         constexpr auto wg_attr_num_access =
-            ((is_a_load_tr<Problem> || is_b_load_tr<Problem>) && !single_load_tr_length)
-                ? WGAttrNumAccessEnum::Double
-                : WGAttrNumAccessEnum::Single;
+            !(is_a_load_tr<Problem> || is_b_load_tr<Problem>) ? WGAttrNumAccessEnum::Single
+            : vector_size == thread_elements                  ? WGAttrNumAccessEnum::Single
+            : vector_size * 2 == thread_elements              ? WGAttrNumAccessEnum::Double
+            : vector_size * 4 == thread_elements              ? WGAttrNumAccessEnum::Quad
+                                                              : WGAttrNumAccessEnum::Invalid;
 
         using WarpGemm = WarpGemmDispatcher<typename Problem::ADataType,
                                             typename Problem::BDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index b05145890f..7263ddd5a1 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
@@ -38,15 +41,24 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
     using Base             = BaseGemmPipelineAgBgCrCompV5<Problem>;
     using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
 
-    using ADataType       = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+    using AsDataType      = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType      = remove_cvref_t<typename Problem::BsDataTypeTuple>;
     using CDataType       = remove_cvref_t<typename Problem::CDataType>;
     using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
     using BlockGemmShape  = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AElementWise = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise = remove_cvref_t<typename Problem::BElementWise>;
+
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     static constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
 
@@ -118,17 +130,25 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
-                  typename ADramBlockWindowTmp,
+                  typename AsDramBlockWindowTmp,
                   typename AElementFunction,
-                  typename BDramBlockWindowTmp,
-                  typename BElementFunction>
-        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                  typename BsDramBlockWindowTmp,
+                  typename BElementFunction,
+                  typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                                is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                            bool>* = nullptr>
+        CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                        const AElementFunction& a_element_func,
-                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
                                        void* __restrict__ p_smem_0) const
         {
+            using ADramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+            using BDramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
             static_assert(
                 std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                     std::is_same_v<BDataType,
@@ -158,7 +178,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
 
             index_t warp_id = get_warp_id();
             index_t operation_id =
-                __builtin_amdgcn_readfirstlane(get_warp_id()); // 0 - Memory read, 1 - block-gemm
+                amd_wave_read_first_lane(get_warp_id()); // 0 - Memory read, 1 - block-gemm
 
             auto a_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);
             auto b_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);
@@ -206,14 +226,16 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
             BGemmTile b_tile_0, b_tile_1;
 
             // Register tile for A and B.
-            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
-            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+            using ABlockTileDistr =
+                decltype(a_copy_dram_window[number<0>{}].get_tile_distribution());
+            using BBlockTileDistr =
+                decltype(b_copy_dram_window[number<0>{}].get_tile_distribution());
             using ABlockTile =
                 decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
             using BBlockTile =
                 decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
-            ABlockTile a_global_load_tile;
-            BBlockTile b_global_load_tile;
+            ABlockTile elementwise_As_res;
+            BBlockTile elementwise_Bs_res;
 
             // Block GEMM
             auto block_gemm     = BlockGemm();
@@ -245,33 +267,45 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
             // define ping, pong steps here as lambda functions.
             auto MemoryOpsStep = [&](auto idx) {
                 // Memory read half here.
-                Base::GlobalPrefetch(
-                    a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
-                Base::GlobalPrefetch(
-                    b_global_load_tile, b_copy_dram_window, b_dram_tile_window_step);
+
+                // Load tile — during value loading, an elementwise function is executed for each
+                // A0, A1, … AN. The values A0, A1, … AN are read by the same thread.
+                elementwise_As_res = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+                // Move each A — the enhanced function move_tile_window is executed, which takes a
+                // tuple as input.
+                move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                // Load tile — during value loading, an elementwise function is executed for each
+                // B0, B1, … BN. The values B0, B1, … BN are read by the same thread.
+                elementwise_Bs_res = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+                // Move each B — the enhanced function move_tile_window is executed, which takes a
+                // tuple as input.
+                move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
                 if constexpr(is_a_col_major)
                 {
                     auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                         Policy::template MakeShuffledARegTileDistribution<Problem>());
-                    transpose_tile2d(a_shuffle_tmp, a_global_load_tile);
-                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                    transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(a_copy_lds_window, a_global_load_tile, a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, elementwise_As_res);
                 }
 
                 if constexpr(is_b_row_major)
                 {
                     auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                         Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                    transpose_tile2d(b_shuffle_tmp, b_global_load_tile);
-                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                    transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(b_copy_lds_window, b_global_load_tile, b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, elementwise_Bs_res);
                 }
 
                 if(idx == 0)
@@ -302,7 +336,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
                 MemoryOpsStep(warp_id);
             }
 
-            index_t num_compute_steps = __builtin_amdgcn_readfirstlane(num_loop);
+            index_t num_compute_steps = amd_wave_read_first_lane(num_loop);
             while(num_compute_steps > 1)
             {
                 block_sync_lds();
@@ -348,13 +382,17 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
         }
     };
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    public:
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                    const AElementFunction& a_element_func,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const BElementFunction& b_element_func,
                                    index_t num_loop,
                                    void* p_smem_0) const
@@ -368,21 +406,62 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
             p_smem_0);
     }
 
-    public:
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const index_t num_loop,
                                    void* __restrict__ p_smem_0) const
     {
         return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
             a_dram_block_window_tmp,
-            [](const ADataType& a) { return a; },
+            [](auto& e, const ADataType& a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType& b) { return b; },
+            [](auto& e, const BDataType& b) { e = b; },
             num_loop,
             p_smem_0);
     }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem_0) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          a_element_func,
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          b_element_func,
+                          num_loop,
+                          p_smem_0);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const index_t num_loop,
+                                   void* __restrict__ p_smem_0) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem_0);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index 1adee65ff0..8bfddacd0a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -175,14 +175,23 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     using Base             = BaseGemmPipelineAgBgCrMem<Problem>;
     using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
 
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
 
@@ -257,17 +266,25 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
-                  typename ADramBlockWindowTmp,
-                  typename BDramBlockWindowTmp,
+                  typename AsDramBlockWindowTmp,
+                  typename BsDramBlockWindowTmp,
                   typename AElementFunction,
-                  typename BElementFunction>
-        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                  typename BElementFunction,
+                  typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                                is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                            bool>* = nullptr>
+        CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                        const AElementFunction& a_element_func,
-                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
                                        void* p_smem) const
         {
+            using ADramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+            using BDramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
             static_assert(
                 std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                     std::is_same_v<BDataType,
@@ -331,8 +348,10 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             auto block_gemm   = BlockGemm();
             auto c_block_tile = block_gemm.MakeCBlockTile();
 
-            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
-            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+            using ABlockTileDistr =
+                decltype(a_copy_dram_window[number<0>{}].get_tile_distribution());
+            using BBlockTileDistr =
+                decltype(b_copy_dram_window[number<0>{}].get_tile_distribution());
 
             using ABlockTile =
                 decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
@@ -389,10 +408,21 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // prefetch
             // global read 0
-            Base::GlobalPrefetch(
-                a_block_tiles.get(I0{}), a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(
-                b_block_tiles.get(I0{}), b_copy_dram_window, b_dram_tile_window_step);
+            // Load tile — during value loading, an elementwise function is executed for each A0,
+            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+            // Load tile — during value loading, an elementwise function is executed for each B0,
+            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
@@ -405,11 +435,11 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                         Policy::template MakeShuffledARegTileDistribution<Problem>());
                     transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(I0{}));
-                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}));
                 }
             }
 
@@ -420,22 +450,25 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                         Policy::template MakeShuffledBRegTileDistribution<Problem>());
                     transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(I0{}));
-                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}));
                 }
             }
 
             // Global prefetch [1, PrefetchStages]
             static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
-                Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                     a_copy_dram_window,
-                                     a_dram_tile_window_step);
-                Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                     b_copy_dram_window,
-                                     b_dram_tile_window_step);
+                a_block_tiles.at(number<prefetch_idx>{}) =
+                    load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+                move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                b_block_tiles.at(number<prefetch_idx>{}) =
+                    load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+                move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
             });
 
             // main body
@@ -469,16 +502,14 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                     a_shuffle_tmp,
                                     a_block_tiles.get(
                                         number<(prefetch_idx + 1) % PrefetchStages>{}));
-                                Base::LocalPrefill(
-                                    a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                                Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                             }
                             else
                             {
                                 Base::LocalPrefill(
                                     a_copy_lds_window,
                                     a_block_tiles.get(
-                                        number<(prefetch_idx + 1) % PrefetchStages>{}),
-                                    a_element_func);
+                                        number<(prefetch_idx + 1) % PrefetchStages>{}));
                             }
                         }
 
@@ -492,25 +523,25 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                     b_shuffle_tmp,
                                     b_block_tiles.get(
                                         number<(prefetch_idx + 1) % PrefetchStages>{}));
-                                Base::LocalPrefill(
-                                    b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                                Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                             }
                             else
                             {
                                 Base::LocalPrefill(
                                     b_copy_lds_window,
                                     b_block_tiles.get(
-                                        number<(prefetch_idx + 1) % PrefetchStages>{}),
-                                    b_element_func);
+                                        number<(prefetch_idx + 1) % PrefetchStages>{}));
                             }
                         }
 
-                        Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                             a_copy_dram_window,
-                                             a_dram_tile_window_step);
-                        Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                             b_copy_dram_window,
-                                             b_dram_tile_window_step);
+                        a_block_tiles.at(number<prefetch_idx>{}) =
+                            load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+                        move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                        b_block_tiles.at(number<prefetch_idx>{}) =
+                            load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+                        move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
                     });
 
                     i += PrefetchStages;
@@ -540,13 +571,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                 Policy::template MakeShuffledARegTileDistribution<Problem>());
                             transpose_tile2d(a_shuffle_tmp,
                                              a_block_tiles.get(number<prefetch_idx>{}));
-                            Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                            Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                         }
                         else
                         {
                             Base::LocalPrefill(a_copy_lds_window,
-                                               a_block_tiles.get(number<prefetch_idx>{}),
-                                               a_element_func);
+                                               a_block_tiles.get(number<prefetch_idx>{}));
                         }
                     }
 
@@ -558,13 +588,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                 Policy::template MakeShuffledBRegTileDistribution<Problem>());
                             transpose_tile2d(b_shuffle_tmp,
                                              b_block_tiles.get(number<prefetch_idx>{}));
-                            Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                            Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                         }
                         else
                         {
                             Base::LocalPrefill(b_copy_lds_window,
-                                               b_block_tiles.get(number<prefetch_idx>{}),
-                                               b_element_func);
+                                               b_block_tiles.get(number<prefetch_idx>{}));
                         }
                     }
                 });
@@ -634,17 +663,25 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
-                  typename ADramBlockWindowTmp,
-                  typename BDramBlockWindowTmp,
+                  typename AsDramBlockWindowTmp,
+                  typename BsDramBlockWindowTmp,
                   typename AElementFunction,
-                  typename BElementFunction>
-        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                  typename BElementFunction,
+                  typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                                is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                            bool>* = nullptr>
+        CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                        const AElementFunction& a_element_func,
-                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
                                        void* p_smem) const
         {
+            using ADramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+            using BDramBlockWindowTmp =
+                remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
             static_assert(
                 std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                     std::is_same_v<BDataType,
@@ -708,8 +745,10 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             auto block_gemm   = BlockGemm();
             auto c_block_tile = block_gemm.MakeCBlockTile();
 
-            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
-            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+            using ABlockTileDistr =
+                decltype(a_copy_dram_window[number<0>{}].get_tile_distribution());
+            using BBlockTileDistr =
+                decltype(b_copy_dram_window[number<0>{}].get_tile_distribution());
 
             using ABlockTile =
                 decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
@@ -754,10 +793,22 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // prefetch
             // global read 0
-            Base::GlobalPrefetch(
-                a_block_tiles.get(I0{}), a_copy_dram_window, a_dram_tile_window_step);
-            Base::GlobalPrefetch(
-                b_block_tiles.get(I0{}), b_copy_dram_window, b_dram_tile_window_step);
+
+            // Load tile — during value loading, an elementwise function is executed for each A0,
+            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+            // Load tile — during value loading, an elementwise function is executed for each B0,
+            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
@@ -770,11 +821,11 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                         Policy::template MakeShuffledARegTileDistribution<Problem>());
                     transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(I0{}));
-                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+                    Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}));
                 }
             }
 
@@ -785,11 +836,11 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                         Policy::template MakeShuffledBRegTileDistribution<Problem>());
                     transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(I0{}));
-                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                 }
                 else
                 {
-                    Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}));
                 }
             }
 
@@ -800,12 +851,15 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // Global prefetch [1, PrefetchStages]
             static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
-                Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                     a_copy_dram_window,
-                                     a_dram_tile_window_step);
-                Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                     b_copy_dram_window,
-                                     b_dram_tile_window_step);
+                a_block_tiles.at(number<prefetch_idx>{}) =
+                    load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+                move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                b_block_tiles.at(number<prefetch_idx>{}) =
+                    load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+                move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
             });
 
             // main body
@@ -837,16 +891,14 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                     a_shuffle_tmp,
                                     a_block_tiles.get(
                                         number<(prefetch_idx + 1) % PrefetchStages>{}));
-                                Base::LocalPrefill(
-                                    a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                                Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                             }
                             else
                             {
                                 Base::LocalPrefill(
                                     a_copy_lds_window,
                                     a_block_tiles.get(
-                                        number<(prefetch_idx + 1) % PrefetchStages>{}),
-                                    a_element_func);
+                                        number<(prefetch_idx + 1) % PrefetchStages>{}));
                             }
                         }
 
@@ -860,39 +912,26 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                     b_shuffle_tmp,
                                     b_block_tiles.get(
                                         number<(prefetch_idx + 1) % PrefetchStages>{}));
-                                Base::LocalPrefill(
-                                    b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                                Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                             }
                             else
                             {
                                 Base::LocalPrefill(
                                     b_copy_lds_window,
-                                    b_block_tiles.get(
-                                        number<(prefetch_idx + 1) % PrefetchStages>{}),
-                                    b_element_func);
-                            }
-                        }
-                        else
-                        {
-                            if constexpr(is_b_row_major && !is_b_load_tr_v())
-                            {
-                                auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                                    Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                                transpose_tile2d(
-                                    b_shuffle_tmp,
                                     b_block_tiles.get(
                                         number<(prefetch_idx + 1) % PrefetchStages>{}));
-                                b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}) =
-                                    b_shuffle_tmp;
                             }
                         }
 
-                        Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                             a_copy_dram_window,
-                                             a_dram_tile_window_step);
-                        Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                             b_copy_dram_window,
-                                             b_dram_tile_window_step);
+                        a_block_tiles.at(number<prefetch_idx>{}) =
+                            load_tile_with_elementwise(a_copy_dram_window, a_element_func);
+
+                        move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+
+                        b_block_tiles.at(number<prefetch_idx>{}) =
+                            load_tile_with_elementwise(b_copy_dram_window, b_element_func);
+
+                        move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
                     });
 
                     i += PrefetchStages;
@@ -913,28 +952,15 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                 Policy::template MakeShuffledARegTileDistribution<Problem>());
                             transpose_tile2d(a_shuffle_tmp,
                                              a_block_tiles.get(number<prefetch_idx>{}));
-                            Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                            Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp);
                         }
                         else
                         {
                             Base::LocalPrefill(a_copy_lds_window,
-                                               a_block_tiles.get(number<prefetch_idx>{}),
-                                               a_element_func);
-                        }
-                    }
-                    else
-                    {
-                        if constexpr(is_a_col_major && !is_a_load_tr_v())
-                        {
-                            auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
-                                Policy::template MakeShuffledARegTileDistribution<Problem>());
-                            transpose_tile2d(
-                                a_shuffle_tmp,
-                                a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}));
-                            a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}) =
-                                a_shuffle_tmp;
+                                               a_block_tiles.get(number<prefetch_idx>{}));
                         }
                     }
+
                     if constexpr(SkipBLds == false)
                     {
                         if constexpr(is_b_row_major && !is_b_load_tr_v())
@@ -943,26 +969,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                                 Policy::template MakeShuffledBRegTileDistribution<Problem>());
                             transpose_tile2d(b_shuffle_tmp,
                                              b_block_tiles.get(number<prefetch_idx>{}));
-                            Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                            Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp);
                         }
                         else
                         {
                             Base::LocalPrefill(b_copy_lds_window,
-                                               b_block_tiles.get(number<prefetch_idx>{}),
-                                               b_element_func);
-                        }
-                    }
-                    else
-                    {
-                        if constexpr(is_b_row_major && !is_b_load_tr_v())
-                        {
-                            auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                                Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                            transpose_tile2d(
-                                b_shuffle_tmp,
-                                b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}));
-                            b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}) =
-                                b_shuffle_tmp;
+                                               b_block_tiles.get(number<prefetch_idx>{}));
                         }
                     }
                 });
@@ -1009,13 +1021,16 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
         }
     };
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                    const AElementFunction& a_element_func,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    const BElementFunction& b_element_func,
                                    index_t num_loop,
                                    void* p_smem) const
@@ -1029,9 +1044,13 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             p_smem);
     }
 
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    bool has_hot_loop,
                                    TailNumber tail_number,
@@ -1040,7 +1059,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
         const auto RunPipeline = [&](auto hot_loop_, auto tail_num_) {
             constexpr bool hot_loop    = hot_loop_.value;
             constexpr auto tail_num    = tail_num_.value;
-            constexpr auto PassThrough = [](const auto& x) { return x; };
+            constexpr auto PassThrough = [](auto& e, const auto& x) { e = x; };
             return PipelineImpl<Scheduler>{}.template operator()<hot_loop, tail_num>(
                 a_dram_block_window_tmp,
                 PassThrough,
@@ -1052,20 +1071,82 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
         return Base::TailHandler(RunPipeline, has_hot_loop, tail_number);
     }
 
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    void* p_smem) const
     {
         return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
             a_dram_block_window_tmp,
-            [](const ADataType& a) { return a; },
+            [](auto& e, const ADataType& a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType& b) { return b; },
+            [](auto& e, const ADataType& a) { e = a; },
             num_loop,
             p_smem);
     }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          a_element_func,
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          b_element_func,
+                          num_loop,
+                          p_smem);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   bool has_hot_loop,
+                                   TailNumber tail_number,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          has_hot_loop,
+                          tail_number,
+                          p_smem);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index e3b4863392..eb363d59b8 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -15,14 +15,23 @@ namespace ck_tile {
 template <typename Problem, typename Policy = UniversalGemmPipelineAgBgCrPolicy>
 struct GemmPipelineAGmemBGmemCRegV1
 {
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
 
@@ -81,17 +90,25 @@ struct GemmPipelineAGmemBGmemCRegV1
         return Policy::template GetSmemSize<Problem>();
     }
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_HOST_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                         const AElementFunction& a_element_func,
-                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                        const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                         const BElementFunction& b_element_func,
                                         index_t num_loop,
                                         void* p_smem) const
     {
+        using ADramBlockWindowTmp =
+            remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+        using BDramBlockWindowTmp =
+            remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
         static_assert(
             std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                 std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
@@ -133,22 +150,30 @@ struct GemmPipelineAGmemBGmemCRegV1
         auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
 
         // A DRAM tile window for load
-        auto a_copy_dram_window =
-            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
-                             a_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeADramTileDistribution<Problem>());
+        auto as_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(
+                    a_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                    make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                    a_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                    Policy::template MakeADramTileDistribution<Problem>());
+            },
+            number<AsLayout::size()>{});
 
         // A LDS tile window for store
         auto a_copy_lds_window = make_tile_window(
             a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
 
         // B DRAM tile window for load
-        auto b_copy_dram_window =
-            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
-                             b_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeBDramTileDistribution<Problem>());
+        auto bs_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(
+                    b_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                    make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                    b_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                    Policy::template MakeBDramTileDistribution<Problem>());
+            },
+            number<BsLayout::size()>{});
 
         // B LDS tile window for store
         auto b_copy_lds_window = make_tile_window(
@@ -182,13 +207,22 @@ struct GemmPipelineAGmemBGmemCRegV1
 
         // prefetch
         // global read 0
-        auto a_block_tile = load_tile(a_copy_dram_window);
-        auto b_block_tile = load_tile(b_copy_dram_window);
+        // Load tile — during value loading, an elementwise function is executed for each A0,
+        // A1, … AN. The values A0, A1, … AN are read by the same thread.
+        auto elementwise_As_res = load_tile_with_elementwise(as_copy_dram_window, a_element_func);
+
+        // Load tile — during value loading, an elementwise function is executed for each B0,
+        // B1, … BN. The values B0, B1, … BN are read by the same thread.
+        auto elementwise_Bs_res = load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
         {
             // move to 1
-            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
-            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(as_copy_dram_window, {0, kKPerBlock});
+            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // as input.
+            move_tile_window(bs_copy_dram_window, {0, kKPerBlock});
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
@@ -198,13 +232,12 @@ struct GemmPipelineAGmemBGmemCRegV1
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
                     Policy::template MakeShuffledARegTileDistribution<Problem>());
-                transpose_tile2d(a_shuffle_tmp, a_block_tile);
-                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
-                store_tile(a_copy_lds_window, a_block_tile_tmp);
+                transpose_tile2d(a_shuffle_tmp, elementwise_As_res);
+                store_tile(a_copy_lds_window, a_shuffle_tmp);
             }
             else
             {
-                store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile));
+                store_tile(a_copy_lds_window, elementwise_As_res);
             }
 
             // LDS write 0
@@ -212,13 +245,12 @@ struct GemmPipelineAGmemBGmemCRegV1
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
                     Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                transpose_tile2d(b_shuffle_tmp, b_block_tile);
-                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp);
-                store_tile(b_copy_lds_window, b_block_tile_tmp);
+                transpose_tile2d(b_shuffle_tmp, elementwise_Bs_res);
+                store_tile(b_copy_lds_window, b_shuffle_tmp);
             }
             else
             {
-                store_tile(b_copy_lds_window, tile_elementwise_in(b_element_func, b_block_tile));
+                store_tile(b_copy_lds_window, elementwise_Bs_res);
             }
         }
 
@@ -226,8 +258,8 @@ struct GemmPipelineAGmemBGmemCRegV1
         while(iCounter > 0)
         {
             // global read i + 1
-            a_block_tile = load_tile(a_copy_dram_window);
-            b_block_tile = load_tile(b_copy_dram_window);
+            elementwise_As_res = load_tile_with_elementwise(as_copy_dram_window, a_element_func);
+            elementwise_Bs_res = load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
             block_sync_lds();
 
@@ -237,22 +269,20 @@ struct GemmPipelineAGmemBGmemCRegV1
             block_sync_lds();
 
             // move to i + 2
-            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
-            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(as_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(bs_copy_dram_window, {0, kKPerBlock});
 
             // LDS write i + 1
             if constexpr(is_a_col_major)
             {
                 auto a_shuffle_tmp_loop = make_static_distributed_tensor<ADataType>(
                     Policy::template MakeShuffledARegTileDistribution<Problem>());
-                transpose_tile2d(a_shuffle_tmp_loop, a_block_tile);
-                store_tile(a_copy_lds_window,
-                           tile_elementwise_in(a_element_func, a_shuffle_tmp_loop));
+                transpose_tile2d(a_shuffle_tmp_loop, elementwise_As_res);
+                store_tile(a_copy_lds_window, a_shuffle_tmp_loop);
             }
             else
             {
-                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-                store_tile(a_copy_lds_window, a_block_tile_tmp);
+                store_tile(a_copy_lds_window, elementwise_As_res);
             }
 
             // LDS write i + 1
@@ -260,14 +290,12 @@ struct GemmPipelineAGmemBGmemCRegV1
             {
                 auto b_shuffle_tmp_loop = make_static_distributed_tensor<BDataType>(
                     Policy::template MakeShuffledBRegTileDistribution<Problem>());
-                transpose_tile2d(b_shuffle_tmp_loop, b_block_tile);
-                store_tile(b_copy_lds_window,
-                           tile_elementwise_in(b_element_func, b_shuffle_tmp_loop));
+                transpose_tile2d(b_shuffle_tmp_loop, elementwise_Bs_res);
+                store_tile(b_copy_lds_window, b_shuffle_tmp_loop);
             }
             else
             {
-                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-                store_tile(b_copy_lds_window, b_block_tile_tmp);
+                store_tile(b_copy_lds_window, elementwise_Bs_res);
             }
 
             iCounter--;
@@ -284,20 +312,40 @@ struct GemmPipelineAGmemBGmemCRegV1
         return c_block_tile;
     }
 
-    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                    index_t num_loop,
                                    void* p_smem) const
     {
         return operator()(
             a_dram_block_window_tmp,
-            [](const ADataType & a) { return a; },
+            [](auto& e, const ADataType & a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType & b) { return b; },
+            [](auto& e, const BDataType & b) { e = b; },
             num_loop,
             p_smem);
     }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index b151cd6782..c309f8908a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -15,30 +15,66 @@ namespace ck_tile {
 template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV2DefaultPolicy>
 struct GemmPipelineAGmemBGmemCRegV2
 {
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
+
     static constexpr index_t APackedSize =
         ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
     static constexpr index_t BPackedSize =
         ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
 
-    static constexpr index_t kBlockSize = Problem::kBlockSize;
+    static constexpr index_t BlockSize = Problem::kBlockSize;
 
     static constexpr index_t kMPerBlock = BlockGemmShape::kM;
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Problem::VectorSizeA;
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Problem::VectorSizeB;
+    }
+    static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
+
     static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
     static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
 
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
+
+    static constexpr bool Preshuffle = Problem::Preshuffle;
+
+    static constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
+
+    // For the basic gemm pipelien DoubleSmemBuffer set to be false naturally.
+    static constexpr bool DoubleSmemBuffer = false;
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
         return concat('_', "pipeline_AGmemBGmemCRegV2",
-                      concat('x', kMPerBlock, kNPerBlock, kKPerBlock, kBlockSize));
+                      concat('x', kMPerBlock, kNPerBlock, kKPerBlock, BlockSize));
         // clang-format on
     }
     CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
@@ -56,17 +92,31 @@ struct GemmPipelineAGmemBGmemCRegV2
                    BPackedSize;
     }
 
-    template <typename ADramBlockWindowTmp,
-              typename BDramBlockWindowTmp,
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename AsDramBlockWindowTmp,
+              typename BsDramBlockWindowTmp,
               typename AElementFunction,
-              typename BElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BsDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_HOST_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
                                         const AElementFunction& a_element_func,
-                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                        const BsDramBlockWindowTmp& b_dram_block_window_tmp,
                                         const BElementFunction& b_element_func,
                                         index_t num_loop,
                                         void* p_smem) const
     {
+
+        using ADramBlockWindowTmp =
+            remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
+        using BDramBlockWindowTmp =
+            remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
+
         static_assert(
             std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
                 std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
@@ -98,32 +148,40 @@ struct GemmPipelineAGmemBGmemCRegV2
         auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
 
         // A DRAM tile window for load
-        auto a_copy_dram_window =
-            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
-                             a_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeADramTileDistribution<Problem>());
+        auto as_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(
+                    a_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                    make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                    a_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                    Policy::template MakeADramTileDistribution<Problem>());
+            },
+            number<AsLayout::size()>{});
 
         // A LDS tile window for store
         auto a_copy_lds_window =
             make_tile_window(a_lds_block,
                              make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
                              {0, 0},
-                             a_copy_dram_window.get_tile_distribution());
+                             as_copy_dram_window[number<0>{}].get_tile_distribution());
 
         // B DRAM tile window for load
-        auto b_copy_dram_window =
-            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
-                             b_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeBDramTileDistribution<Problem>());
+        auto bs_copy_dram_window = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(
+                    b_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
+                    make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                    b_dram_block_window_tmp[number<idx>{}].get_window_origin(),
+                    Policy::template MakeBDramTileDistribution<Problem>());
+            },
+            number<BsLayout::size()>{});
 
         // B LDS tile window for store
         auto b_copy_lds_window =
             make_tile_window(b_lds_block,
                              make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
                              {0, 0},
-                             b_copy_dram_window.get_tile_distribution());
+                             bs_copy_dram_window[number<0>{}].get_tile_distribution());
 
         // Block GEMM
         constexpr auto block_gemm = Policy::template GetBlockGemm<Problem>();
@@ -153,28 +211,30 @@ struct GemmPipelineAGmemBGmemCRegV2
 
         // prefetch
         // global read 0
-        auto a_block_tile = load_tile(a_copy_dram_window);
-        auto b_block_tile = load_tile(b_copy_dram_window);
+        // Load tile — during value loading, an elementwise function is executed for each A0,
+        // A1, … AN. The values A0, A1, … AN are read by the same thread.
+        auto elementwise_As_res = load_tile_with_elementwise(as_copy_dram_window, a_element_func);
+        // Load tile — during value loading, an elementwise function is executed for each B0,
+        // B1, … BN. The values B0, B1, … BN are read by the same thread.
+        auto elementwise_Bs_res = load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
         {
             // move to 1
-            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
-            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(as_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(bs_copy_dram_window, {0, kKPerBlock});
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
             // LDS write 0
-            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-            store_tile(a_copy_lds_window, a_block_tile_tmp);
+            store_tile(a_copy_lds_window, elementwise_As_res);
             // global read 1
-            a_block_tile = load_tile(a_copy_dram_window);
+            elementwise_As_res = load_tile_with_elementwise(as_copy_dram_window, a_element_func);
 
             // LDS write 0
-            const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-            store_tile(b_copy_lds_window, b_block_tile_tmp);
+            store_tile(b_copy_lds_window, elementwise_Bs_res);
             // global read 1
-            b_block_tile = load_tile(b_copy_dram_window);
+            elementwise_Bs_res = load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
         }
 
         index_t iCounter = num_loop - 2;
@@ -189,20 +249,18 @@ struct GemmPipelineAGmemBGmemCRegV2
             block_sync_lds();
 
             // move to i + 2
-            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
-            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(as_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(bs_copy_dram_window, {0, kKPerBlock});
 
             // LDS write i + 1
-            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-            store_tile(a_copy_lds_window, a_block_tile_tmp);
+            store_tile(a_copy_lds_window, elementwise_As_res);
             // global read i + 2
-            a_block_tile = load_tile(a_copy_dram_window);
+            elementwise_As_res = load_tile_with_elementwise(as_copy_dram_window, a_element_func);
 
             // LDS write i + 1
-            const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-            store_tile(b_copy_lds_window, b_block_tile_tmp);
+            store_tile(b_copy_lds_window, elementwise_Bs_res);
             // global read i + 2
-            b_block_tile = load_tile(b_copy_dram_window);
+            elementwise_Bs_res = load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
             iCounter--;
 
@@ -218,11 +276,9 @@ struct GemmPipelineAGmemBGmemCRegV2
             block_sync_lds();
 
             // LDS write num_loop - 1
-            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-            store_tile(a_copy_lds_window, a_block_tile_tmp);
+            store_tile(a_copy_lds_window, elementwise_As_res);
 
-            const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-            store_tile(b_copy_lds_window, b_block_tile_tmp);
+            store_tile(b_copy_lds_window, elementwise_Bs_res);
 
             block_sync_lds();
 
@@ -241,12 +297,28 @@ struct GemmPipelineAGmemBGmemCRegV2
     {
         return operator()(
             a_dram_block_window_tmp,
-            [](const ADataType & a) { return a; },
+            [](auto& e, const ADataType & a) { e = a; },
             b_dram_block_window_tmp,
-            [](const BDataType & b) { return b; },
+            [](auto& e, const BDataType & b) { e = b; },
             num_loop,
             p_smem);
     }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BDramBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(ck_tile::make_tuple(a_dram_block_window_tmp),
+                          ck_tile::make_tuple(b_dram_block_window_tmp),
+                          num_loop,
+                          p_smem);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index 4c8401a007..10f5c31525 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -5,16 +5,19 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
 #include "ck_tile/host/concat.hpp"
 
 namespace ck_tile {
 
-template <typename ADataType_,
-          typename BDataType_,
-          typename CDataType_,
+template <typename AsDataType_,
+          typename BsDataType_,
+          typename EDataType_,
           typename BlockGemmShape_,
           typename Traits_,
-          typename ComputeDataType_ = ADataType_,
+          typename ComputeDataType_ = AsDataType_,
+          typename AElementWise_    = ck_tile::element_wise::PassThrough,
+          typename BElementWise_    = ck_tile::element_wise::PassThrough,
           bool FixedVectorSize_     = false,
           index_t VectorSizeA_      = 1,
           index_t VectorSizeB_      = 1>
@@ -22,18 +25,49 @@ struct GemmPipelineProblemBase
 {
     using Traits = remove_cvref_t<Traits_>;
 
-    using ADataType       = remove_cvref_t<ADataType_>;
-    using BDataType       = remove_cvref_t<BDataType_>;
-    using CDataType       = remove_cvref_t<CDataType_>; // actually AccDataType
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using AsDataType = remove_cvref_t<AsDataType_>;
+    using BsDataType = remove_cvref_t<BsDataType_>;
+    using CDataType  = remove_cvref_t<EDataType_>; // actually AccDataType
 
     static constexpr bool FixedVectorSize = FixedVectorSize_;
 
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
 
-    using ALayout = remove_cvref_t<typename Traits::ALayout>;
-    using BLayout = remove_cvref_t<typename Traits::BLayout>;
-    using CLayout = remove_cvref_t<typename Traits::CLayout>;
+    using AElementWise = remove_cvref_t<AElementWise_>;
+    using BElementWise = remove_cvref_t<BElementWise_>;
+
+    using AsLayout = remove_cvref_t<typename Traits::AsLayout>;
+    using BsLayout = remove_cvref_t<typename Traits::BsLayout>;
+    using CLayout  = remove_cvref_t<typename Traits::CLayout>;
+
+    static constexpr bool ComputeDataTypeIsTuple = is_detected<is_tuple, ComputeDataType_>::value;
+    static constexpr bool ADataTypeIsTuple       = is_detected<is_tuple, AsDataType>::value;
+    static constexpr bool BDataTypeIsTuple       = is_detected<is_tuple, BsDataType>::value;
+
+    static constexpr bool ALayoutIsTuple = is_detected<is_tuple, AsLayout>::value;
+    static constexpr bool BLayoutIsTuple = is_detected<is_tuple, BsLayout>::value;
+
+    using ComputeDataTypeTuple = std::conditional_t<ComputeDataTypeIsTuple,
+                                                    remove_cvref_t<ComputeDataType_>,
+                                                    remove_cvref_t<tuple<ComputeDataType_>>>;
+    using AsLayoutTuple        = std::
+        conditional_t<ALayoutIsTuple, remove_cvref_t<AsLayout>, remove_cvref_t<tuple<AsLayout>>>;
+    using BsLayoutTuple = std::
+        conditional_t<BLayoutIsTuple, remove_cvref_t<BsLayout>, remove_cvref_t<tuple<BsLayout>>>;
+
+    using AsDataTypeTuple = std::conditional_t<ADataTypeIsTuple,
+                                               remove_cvref_t<AsDataType>,
+                                               remove_cvref_t<tuple<AsDataType>>>;
+
+    using BsDataTypeTuple = std::conditional_t<BDataTypeIsTuple,
+                                               remove_cvref_t<BsDataType>,
+                                               remove_cvref_t<tuple<BsDataType>>>;
+
+    using ComputeDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, ComputeDataTypeTuple>>;
+    using ADataType       = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
+    using ALayout         = remove_cvref_t<std::tuple_element_t<number<0>{}, AsLayoutTuple>>;
+    using BDataType       = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
+    using BLayout         = remove_cvref_t<std::tuple_element_t<number<0>{}, BsLayoutTuple>>;
 
     static constexpr bool TransposeC            = Traits::TransposeC;
     static constexpr index_t NumWaveGroups      = Traits::NumWaveGroups;
@@ -128,7 +162,7 @@ struct GemmPipelineProblemBase
         {
             return VectorSizeA_;
         }
-        else if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        else if constexpr(std::is_same_v<AsLayout, tensor_layout::gemm::RowMajor>)
         {
             return kPadK ? 1 : GetAlignmentA();
         }
@@ -143,7 +177,7 @@ struct GemmPipelineProblemBase
         {
             return VectorSizeB_;
         }
-        else if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+        else if constexpr(std::is_same_v<BsLayout, tensor_layout::gemm::ColumnMajor>)
         {
             return kPadN ? 1 : GetAlignmentB();
         }
@@ -164,35 +198,40 @@ struct GemmPipelineProblemBase
     }();
 };
 
-// Alias for GemmPipelineProblem
-template <typename ADataType_,
-          typename BDataType_,
-          typename CDataType_,
+template <typename AsDataType_,
+          typename BsDataType_,
+          typename EDataType_,
           typename BlockGemmShape_,
           typename Traits_,
-          typename ComputeDataType_ = ADataType_,
+          typename AElementWise_    = ck_tile::element_wise::PassThrough,
+          typename BElementWise_    = ck_tile::element_wise::PassThrough,
+          typename ComputeDataType_ = AsDataType_,
           bool FixedVectorSize_     = false,
           index_t VectorSizeA_      = 1,
           index_t VectorSizeB_      = 1>
-using GemmPipelineProblem = GemmPipelineProblemBase<ADataType_,
-                                                    BDataType_,
-                                                    CDataType_,
+using GemmPipelineProblem = GemmPipelineProblemBase<AsDataType_,
+                                                    BsDataType_,
+                                                    EDataType_,
                                                     BlockGemmShape_,
                                                     Traits_,
                                                     ComputeDataType_,
+                                                    AElementWise_,
+                                                    BElementWise_,
                                                     FixedVectorSize_,
                                                     VectorSizeA_,
                                                     VectorSizeB_>;
 
-template <typename ADataType_,
-          typename BDataType_,
-          typename CDataType_,
+template <typename AsDataType_,
+          typename BsDataType_,
+          typename EDataType_,
           typename BlockGemmShape_,
           typename Traits_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full,
-          typename ComputeDataType_        = ADataType_,
+          typename AElementWise_           = ck_tile::element_wise::PassThrough,
+          typename BElementWise_           = ck_tile::element_wise::PassThrough,
+          typename ComputeDataType_        = AsDataType_,
           bool FixedVectorSize_            = false,
           index_t VectorSizeA_             = 1,
           index_t VectorSizeB_             = 1>
@@ -200,18 +239,48 @@ struct UniversalGemmPipelineProblem
 {
     using Traits = remove_cvref_t<Traits_>;
 
-    using ADataType       = remove_cvref_t<ADataType_>;
-    using BDataType       = remove_cvref_t<BDataType_>;
-    using CDataType       = remove_cvref_t<CDataType_>; // actually AccDataType
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using AsDataType   = remove_cvref_t<AsDataType_>;
+    using BsDataType   = remove_cvref_t<BsDataType_>;
+    using CDataType    = remove_cvref_t<EDataType_>; // actually AccDataType
+    using AElementWise = remove_cvref_t<AElementWise_>;
+    using BElementWise = remove_cvref_t<BElementWise_>;
 
     static constexpr bool FixedVectorSize = FixedVectorSize_;
 
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
 
-    using ALayout = remove_cvref_t<typename Traits::ALayout>;
-    using BLayout = remove_cvref_t<typename Traits::BLayout>;
-    using CLayout = remove_cvref_t<typename Traits::CLayout>;
+    using AsLayout = remove_cvref_t<typename Traits::AsLayout>;
+    using BsLayout = remove_cvref_t<typename Traits::BsLayout>;
+    using CLayout  = remove_cvref_t<typename Traits::CLayout>;
+
+    static constexpr bool ComputeDataTypeIsTuple = is_detected<is_tuple, ComputeDataType_>::value;
+    static constexpr bool ADataTypeIsTuple       = is_detected<is_tuple, AsDataType>::value;
+    static constexpr bool BDataTypeIsTuple       = is_detected<is_tuple, BsDataType>::value;
+
+    static constexpr bool ALayoutIsTuple = is_detected<is_tuple, AsLayout>::value;
+    static constexpr bool BLayoutIsTuple = is_detected<is_tuple, BsLayout>::value;
+
+    using ComputeDataTypeTuple = std::conditional_t<ComputeDataTypeIsTuple,
+                                                    remove_cvref_t<ComputeDataType_>,
+                                                    remove_cvref_t<tuple<ComputeDataType_>>>;
+    using AsLayoutTuple        = std::
+        conditional_t<ALayoutIsTuple, remove_cvref_t<AsLayout>, remove_cvref_t<tuple<AsLayout>>>;
+    using BsLayoutTuple = std::
+        conditional_t<BLayoutIsTuple, remove_cvref_t<BsLayout>, remove_cvref_t<tuple<BsLayout>>>;
+
+    using AsDataTypeTuple = std::conditional_t<ADataTypeIsTuple,
+                                               remove_cvref_t<AsDataType>,
+                                               remove_cvref_t<tuple<AsDataType>>>;
+
+    using BsDataTypeTuple = std::conditional_t<BDataTypeIsTuple,
+                                               remove_cvref_t<BsDataType>,
+                                               remove_cvref_t<tuple<BsDataType>>>;
+
+    using ComputeDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, ComputeDataTypeTuple>>;
+    using ADataType       = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
+    using ALayout         = remove_cvref_t<std::tuple_element_t<number<0>{}, AsLayoutTuple>>;
+    using BDataType       = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
+    using BLayout         = remove_cvref_t<std::tuple_element_t<number<0>{}, BsLayoutTuple>>;
 
     static constexpr bool TransposeC            = Traits::TransposeC;
     static constexpr index_t NumWaveGroups      = Traits::NumWaveGroups;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index 1af40fbd4d..387084cb4a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -168,11 +168,11 @@ struct UniversalGemmBasePolicy
         {
             constexpr index_t BlockSize   = Problem::kBlockSize;
             constexpr index_t VecLoadSize = GetVectorSizeB<Problem>();
-            using TileEncodingPattern     = TileDistributionEncodingPattern2D<BlockSize,
-                                                                              KPerBlock,
-                                                                              NPerBlock,
-                                                                              VecLoadSize,
-                                                                              BTileAccessPattern>;
+            using TileEncodingPattern     = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                                  KPerBlock,
+                                                                                  NPerBlock,
+                                                                                  VecLoadSize,
+                                                                                  BTileAccessPattern>;
 
             constexpr auto BK0 = number<TileEncodingPattern::X1>{};
             constexpr auto BK1 = number<TileEncodingPattern::Y0>{};
@@ -356,11 +356,14 @@ struct UniversalGemmBasePolicy
     template <typename Problem, bool IsWave32Host = false>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeA()
     {
-        using ALayout               = remove_cvref_t<typename Problem::ALayout>;
-        using ADataType             = remove_cvref_t<typename Problem::ADataType>;
+        using AsLayout              = remove_cvref_t<typename Problem::AsLayoutTuple>;
+        using AsDataType            = remove_cvref_t<typename Problem::AsDataTypeTuple>;
         constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
 
+        using ALayout   = remove_cvref_t<std::tuple_element_t<number<0>{}, AsLayout>>;
+        using ADataType = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataType>>;
+
         if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
             return GetGlobalVectorLoadSize<Problem,
@@ -382,11 +385,14 @@ struct UniversalGemmBasePolicy
     template <typename Problem, bool IsWave32Host = false>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeB()
     {
-        using BLayout               = remove_cvref_t<typename Problem::BLayout>;
-        using BDataType             = remove_cvref_t<typename Problem::BDataType>;
+        using BsLayout              = remove_cvref_t<typename Problem::BsLayoutTuple>;
+        using BsDataType            = remove_cvref_t<typename Problem::BsDataTypeTuple>;
         constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
 
+        using BLayout   = remove_cvref_t<std::tuple_element_t<number<0>{}, BsLayout>>;
+        using BDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataType>>;
+
         if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
             return GetGlobalVectorLoadSize<Problem,
@@ -482,8 +488,6 @@ struct UniversalGemmBasePolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
     {
-        using ALayout = remove_cvref_t<typename Problem::ALayout>;
-
         constexpr index_t BlockSize = Problem::kBlockSize;
         constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
@@ -491,35 +495,35 @@ struct UniversalGemmBasePolicy
             Problem::FixedVectorSize ? Problem::VectorSizeA : GetVectorSizeA<Problem>();
         constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
 
+        using ALayout = remove_cvref_t<
+            std::tuple_element_t<number<0>{}, remove_cvref_t<typename Problem::AsLayoutTuple>>>;
         // Tile: MPerBlock X KPerBlock
         if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
-            using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                          MPerBlock,
-                                                                          KPerBlock,
-                                                                          VecLoadSize,
-                                                                          ATileAccessPattern,
-                                                                          NumWaveGroups>;
-            return TileEncodingPattern::Make2DStaticTileDistribution();
+            using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                              MPerBlock,
+                                                                              KPerBlock,
+                                                                              VecLoadSize,
+                                                                              ATileAccessPattern,
+                                                                              NumWaveGroups>;
+            return TileEncodingPattern::make_2d_static_tile_distribution();
         }
         // Tile: KPerBlock X MPerBlock
         else
         {
-            using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                          KPerBlock,
-                                                                          MPerBlock,
-                                                                          VecLoadSize,
-                                                                          ATileAccessPattern,
-                                                                          NumWaveGroups>;
-            return TileEncodingPattern::Make2DStaticTileDistribution();
+            using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                              KPerBlock,
+                                                                              MPerBlock,
+                                                                              VecLoadSize,
+                                                                              ATileAccessPattern,
+                                                                              NumWaveGroups>;
+            return TileEncodingPattern::make_2d_static_tile_distribution();
         }
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
     {
-        using BLayout = remove_cvref_t<typename Problem::BLayout>;
-
         constexpr index_t BlockSize = Problem::kBlockSize;
         constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
@@ -527,34 +531,37 @@ struct UniversalGemmBasePolicy
             Problem::FixedVectorSize ? Problem::VectorSizeB : GetVectorSizeB<Problem>();
         constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
 
+        using BLayout = remove_cvref_t<
+            std::tuple_element_t<number<0>{}, remove_cvref_t<typename Problem::BsLayoutTuple>>>;
         // Tile: KPerBlock X NPerBlock
         if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
-            using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                          KPerBlock,
-                                                                          NPerBlock,
-                                                                          VecLoadSize,
-                                                                          BTileAccessPattern,
-                                                                          NumWaveGroups>;
-            return TileEncodingPattern::Make2DStaticTileDistribution();
+            using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                              KPerBlock,
+                                                                              NPerBlock,
+                                                                              VecLoadSize,
+                                                                              BTileAccessPattern,
+                                                                              NumWaveGroups>;
+            return TileEncodingPattern::make_2d_static_tile_distribution();
         }
         // Tile: NPerBlock X KPerBlock
         else
         {
-            using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                          NPerBlock,
-                                                                          KPerBlock,
-                                                                          VecLoadSize,
-                                                                          BTileAccessPattern,
-                                                                          NumWaveGroups>;
-            return TileEncodingPattern::Make2DStaticTileDistribution();
+            using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                              NPerBlock,
+                                                                              KPerBlock,
+                                                                              VecLoadSize,
+                                                                              BTileAccessPattern,
+                                                                              NumWaveGroups>;
+            return TileEncodingPattern::make_2d_static_tile_distribution();
         }
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegTileDistribution()
     {
-        using ALayout = remove_cvref_t<typename Problem::ALayout>;
+        using ALayout = remove_cvref_t<
+            std::tuple_element_t<number<0>{}, remove_cvref_t<typename Problem::AsLayoutTuple>>>;
         static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
         constexpr index_t BlockSize     = Problem::kBlockSize;
         constexpr index_t MPerBlock     = Problem::BlockGemmShape::kM;
@@ -562,19 +569,20 @@ struct UniversalGemmBasePolicy
         constexpr index_t VecLoadSize   = GetVectorSizeA<Problem>();
         constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
 
-        using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                      KPerBlock,
-                                                                      MPerBlock,
-                                                                      VecLoadSize,
-                                                                      ATileAccessPattern,
-                                                                      NumWaveGroups>;
-        return TileEncodingPattern::MakeShuffled2DStaticTileDistribution();
+        using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                          KPerBlock,
+                                                                          MPerBlock,
+                                                                          VecLoadSize,
+                                                                          ATileAccessPattern,
+                                                                          NumWaveGroups>;
+        return TileEncodingPattern::make_shuffled_2d_static_tile_distribution();
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegTileDistribution()
     {
-        using BLayout = remove_cvref_t<typename Problem::BLayout>;
+        using BLayout = remove_cvref_t<
+            std::tuple_element_t<number<0>{}, remove_cvref_t<typename Problem::BsLayoutTuple>>>;
         static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
         constexpr index_t BlockSize     = Problem::kBlockSize;
         constexpr index_t NPerBlock     = Problem::BlockGemmShape::kN;
@@ -582,13 +590,13 @@ struct UniversalGemmBasePolicy
         constexpr index_t VecLoadSize   = GetVectorSizeB<Problem>();
         constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
 
-        using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                      KPerBlock,
-                                                                      NPerBlock,
-                                                                      VecLoadSize,
-                                                                      BTileAccessPattern,
-                                                                      NumWaveGroups>;
-        return TileEncodingPattern::MakeShuffled2DStaticTileDistribution();
+        using TileEncodingPattern = tile_distribution_encoding_pattern_2d<BlockSize,
+                                                                          KPerBlock,
+                                                                          NPerBlock,
+                                                                          VecLoadSize,
+                                                                          BTileAccessPattern,
+                                                                          NumWaveGroups>;
+        return TileEncodingPattern::make_shuffled_2d_static_tile_distribution();
     }
 
     template <typename Problem>
@@ -637,13 +645,13 @@ struct UniversalGemmBasePolicy
 
 // Forward declarations, to avoid circular dependencies
 // induced by BlockUniversalGemmBase's policy dependency
-template <typename Problem, typename Policy>
+template <typename Problem, typename Policy, index_t UnaryOpSize>
 struct BlockUniversalGemmAsBsCr;
-template <typename Problem, typename Policy>
+template <typename Problem, typename Policy, index_t UnaryOpSize>
 struct BlockUniversalGemmAsBrCr;
-template <typename Problem, typename Policy>
+template <typename Problem, typename Policy, index_t UnaryOpSize>
 struct BlockUniversalGemmArBsCr;
-template <typename Problem, typename Policy>
+template <typename Problem, typename Policy, index_t UnaryOpSize>
 struct BlockUniversalGemmArBrCr;
 
 // UniversalGemm Policy
@@ -677,6 +685,8 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                             Problem::UseStructuredSparsity,
                                             wg_attr_num_access>;
 
+        constexpr index_t unary_op_size = 8;
+
         if constexpr(Problem::SkipALds == false && Problem::SkipBLds == false)
         {
             using BlockGemmPolicy =
@@ -685,7 +695,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                       typename Problem::CDataType,
                                                       BlockWarps,
                                                       WarpGemm>;
-            return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy>{};
+            return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy, unary_op_size>{};
         }
         else if constexpr(Problem::SkipALds == false && Problem::SkipBLds == true)
         {
@@ -695,7 +705,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                      typename Problem::CDataType,
                                                      BlockWarps,
                                                      WarpGemm>;
-            return BlockUniversalGemmAsBrCr<Problem, BlockGemmPolicy>{};
+            return BlockUniversalGemmAsBrCr<Problem, BlockGemmPolicy, unary_op_size>{};
         }
         else if constexpr(Problem::SkipALds == true && Problem::SkipBLds == false)
         {
@@ -705,7 +715,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                      typename Problem::CDataType,
                                                      BlockWarps,
                                                      WarpGemm>;
-            return BlockUniversalGemmArBsCr<Problem, BlockGemmPolicy>{};
+            return BlockUniversalGemmArBsCr<Problem, BlockGemmPolicy, unary_op_size>{};
         }
         else
         {
@@ -714,7 +724,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                                         typename Problem::CDataType,
                                                                         BlockWarps,
                                                                         WarpGemm>;
-            return BlockUniversalGemmArBrCr<Problem, BlockGemmPolicy>{};
+            return BlockUniversalGemmArBrCr<Problem, BlockGemmPolicy, unary_op_size>{};
         }
     }
 };
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
index 933e8ccdcb..ecaca46f49 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
@@ -10,8 +10,8 @@ namespace ck_tile {
 template <bool kPadM_,
           bool kPadN_,
           bool kPadK_,
-          typename ALayout_,
-          typename BLayout_,
+          typename AsLayout_,
+          typename BsLayout_,
           typename CLayout_,
           index_t NumWaveGroups_ = 1>
 struct TileGemmTraits
@@ -23,9 +23,9 @@ struct TileGemmTraits
     // TODO this can't be hardcoded here! Should be in policy!
     static constexpr int _VectorSize = 16;
 
-    using ALayout = ALayout_;
-    using BLayout = BLayout_;
-    using CLayout = CLayout_;
+    using AsLayout = AsLayout_;
+    using BsLayout = BsLayout_;
+    using CLayout  = CLayout_;
 
     static constexpr bool TransposeC            = false;
     static constexpr bool SkipALds              = false;
@@ -44,14 +44,14 @@ template <bool kPadM_,
           bool kPadN_,
           bool kPadK_,
           bool DoubleSmemBuffer_,
-          typename ALayout_,
-          typename BLayout_,
+          typename AsLayout_,
+          typename BsLayout_,
           typename CLayout_,
           bool TransposeC_            = false,
           bool UseStructuredSparsity_ = false,
           bool UsePersistentKernel_   = false,
           index_t NumWaveGroups_      = 1,
-          bool Preshuffle_            = 0,
+          bool Preshuffle_            = false,
           bool SkipALds_              = false,
           bool SkipBLds_              = false>
 struct TileGemmUniversalTraits
@@ -62,9 +62,9 @@ struct TileGemmUniversalTraits
     static constexpr int _VectorSize       = 16;
     static constexpr bool DoubleSmemBuffer = DoubleSmemBuffer_;
 
-    using ALayout = ALayout_;
-    using BLayout = BLayout_;
-    using CLayout = CLayout_;
+    using AsLayout = AsLayout_;
+    using BsLayout = BsLayout_;
+    using CLayout  = CLayout_;
 
     static constexpr bool TransposeC = TransposeC_;
     /// @brief SkipALds_ Parameterize the data flow for matrix A: if true, global memory ->
@@ -83,8 +83,8 @@ template <bool kPadM_,
           bool kPadN_,
           bool kPadK_,
           bool DoubleSmemBuffer_,
-          typename ALayout_,
-          typename BLayout_,
+          typename AsLayout_,
+          typename BsLayout_,
           typename CLayout_,
           bool TransposeC_            = false,
           bool UseStructuredSparsity_ = false>
@@ -92,8 +92,8 @@ using PersistentTileGemmUniversalTraits = TileGemmUniversalTraits<kPadM_,
                                                                   kPadN_,
                                                                   kPadK_,
                                                                   DoubleSmemBuffer_,
-                                                                  ALayout_,
-                                                                  BLayout_,
+                                                                  AsLayout_,
+                                                                  BsLayout_,
                                                                   CLayout_,
                                                                   TransposeC_,
                                                                   UseStructuredSparsity_,
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
index f28208df52..f1c8f2ec9b 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
@@ -89,14 +89,19 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
     CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
     {
         using TileShape = typename Problem::BlockGemmShape;
+#if defined(__gfx11__)
+        constexpr index_t scale = 4;
+#else
+        constexpr index_t scale = get_warp_size() == 32 ? 2 : 1;
+#endif
         if constexpr(TileShape::WarpTile::at(I1) == 32)
         {
-            return TileShape::WarpTile::at(I2) / 2;
+            return TileShape::WarpTile::at(I2) * scale / 2;
         }
         else
         {
             static_assert(TileShape::WarpTile::at(I1) == 16);
-            return TileShape::WarpTile::at(I2) / 4;
+            return TileShape::WarpTile::at(I2) * scale / 4;
         }
     }
 
@@ -192,7 +197,7 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
+    CK_TILE_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
     {
         using TileShape = typename Problem::BlockGemmShape;
 
@@ -200,8 +205,13 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
         constexpr index_t WaveSize  = get_warp_size();
         constexpr index_t WaveNum   = BlockSize / WaveSize;
 
-        constexpr index_t KBPerLoad   = GetKBPerLoad<Problem>();
-        constexpr index_t KThdPerWave = WaveSize; // threads cnt in K dim
+        constexpr index_t KBPerLoad = GetKBPerLoad<Problem>();
+#if defined(__gfx11__)
+        constexpr index_t KRepeatInWave = 2;
+#else
+        constexpr index_t KRepeatInWave = 1;
+#endif
+        constexpr index_t KThdPerWave = WaveSize / KRepeatInWave; // threads cnt in K dim
         constexpr index_t KWavePerBlk = 1;
         constexpr index_t KRepeat     = 1;
         static_assert(TileShape::flatKPerWarp == KThdPerWave * KBPerLoad, "wrong");
@@ -212,16 +222,15 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
         constexpr index_t NRepeat     = 1;
 
         constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
-
         return make_static_tile_distribution(
             tile_distribution_encoding<
-                sequence<WaveRepeat>,                                          // ?
+                sequence<WaveRepeat, KRepeatInWave>,                           // ?
                 tuple<sequence<NRepeat, NWavePerBlk, NThdPerWave, NBPerLoad>,  // second direction
                       sequence<KRepeat, KWavePerBlk, KThdPerWave, KBPerLoad>>, // first  direction
                 // wave in blk,     // thd in wave
                 // <M, K>           // <M, K>
-                tuple<sequence<0, 1, 2>, sequence<1, 2>>, // which direction
-                tuple<sequence<0, 1, 1>, sequence<2, 2>>, // which index
+                tuple<sequence<0, 1, 2>, sequence<0, 1, 2>>, // which direction
+                tuple<sequence<0, 1, 1>, sequence<1, 2, 2>>, // which index
                 // <repeat, vec_load>
                 sequence<1, 1, 2, 2>,
                 sequence<0, 3, 0, 3>>{});
@@ -280,13 +289,17 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
     {
         using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm   = WarpGemmDispatcher<typename Problem::ADataType,
-                                              typename Problem::BDataType,
-                                              typename Problem::CDataType,
-                                              WarpTile::at(I0),
-                                              WarpTile::at(I1),
-                                              WarpTile::at(I2),
-                                              Problem::TransposeC>;
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<typename Problem::BDataType, ck_tile::pk_int4_t>,
+                               typename Problem::ADataType,
+                               typename Problem::BDataType>;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ADataType,
+                                            BTypeToUse,
+                                            typename Problem::CDataType,
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            Problem::TransposeC>;
 
         using BlockWeightPreshufflePolicy =
             BlockWeightPreshuffleASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
@@ -296,6 +309,73 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
                                                               WarpGemm>;
         return BlockWeightPreshuffleASmemBSmemCRegV1<Problem, BlockWeightPreshufflePolicy>{};
     }
+    /**
+     * @brief Get the vector store size for C tensor.
+     *
+     * @tparam Problem - Gemm pipeline problem class.
+     *
+     * @note The vector store size for output C tensor would depend on multiple factors
+     *       like its data layout and warp gemm C transposition. In general it would
+     *       be the number of consecutive elements in contiguous C dimension hold by
+     *       single thread.
+     *
+     * @return The vector store size for C tensor.
+     */
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeC()
+    {
+        using BlockGemm = remove_cvref_t<decltype(GetBlockWeightPreshuffle<Problem>())>;
+        using WG_       = typename BlockGemm::WG;
+
+        constexpr bool TransposeC = Problem::TransposeC;
+        using CLayout             = typename Problem::CLayout;
+        using CWarpDstr           = typename WG_::CWarpDstr;
+
+        // N is contiguous dimension
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if constexpr(TransposeC)
+            {
+                // In this case each thread has multiple consecutive elements in
+                // N dimension, however consecutive threads' elements have stride.
+                constexpr index_t NDimY = CWarpDstr::NDimY;
+                constexpr auto c_warp_y_lengths =
+                    CWarpDstr{}.get_ys_to_d_descriptor().get_lengths();
+                static_assert(WG_::WarpGemmAttribute::Impl::kCM1PerLane ==
+                              c_warp_y_lengths.get(number<NDimY - 1>{}));
+                return c_warp_y_lengths.get(number<NDimY - 1>{});
+            }
+            else
+            {
+                // In this case each thread has just a single item in Ndim
+                return WG_::WarpGemmAttribute::Impl::kCNLane / WG_::kN;
+            }
+        }
+        // M is contiguous dimension
+        else if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::ColumnMajor>)
+        {
+            if constexpr(TransposeC)
+            {
+                // In this case each thread has just a single item in Mdim
+                return WG_::WarpGemmAttribute::Impl::kCNLane / WG_::kN;
+            }
+            else
+            {
+                // In this case each thread has multiple consecutive elements in
+                // M dimension, however consecutive threads' elements have stride.
+                constexpr index_t NDimY = CWarpDstr::NDimY;
+                constexpr auto c_warp_y_lengths =
+                    CWarpDstr{}.get_ys_to_d_descriptor().get_lengths();
+                static_assert(WG_::WarpGemmAttribute::Impl::kCM1PerLane ==
+                              c_warp_y_lengths.get(number<NDimY - 1>{}));
+                return c_warp_y_lengths.get(number<NDimY - 1>{});
+            }
+        }
+        else
+        {
+            static_assert(false, "Unsupported CLayout!");
+        }
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
index b91c211d91..7095b4bd23 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/host/concat.hpp"
 #include "ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp"
 
@@ -37,15 +38,24 @@ template <typename Problem, typename PipelinePolicy = UniversalWeightPreshuffleP
 struct WeightPreshufflePipelineAGmemBGmemCRegV1
     : public BaseWeightPreshufflePipelineAGmemBGmemCRegV1<Problem>
 {
-    using Base           = BaseWeightPreshufflePipelineAGmemBGmemCRegV1<Problem>;
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using Base       = BaseWeightPreshufflePipelineAGmemBGmemCRegV1<Problem>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     using BlockWeightPreshuffle =
         remove_cvref_t<decltype(PipelinePolicy::template GetBlockWeightPreshuffle<Problem>())>;
@@ -188,12 +198,18 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
         }
     }
 
-    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const AElementFunction& a_element_func,
-                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem) const
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename AElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr,
+              index_t UnaryOpSize_             = 8>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
     {
         static_assert(
             std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
@@ -296,14 +312,14 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
             NIterPerWarp>
             b_flat_dram_windows;
 
-        statically_indexed_array<
-            statically_indexed_array<decltype(load_tile(b_flat_dram_window)), KIterPerWarp>,
-            NIterPerWarp>
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+        using BTileType = decltype(make_static_distributed_tensor<BTypeToUse>(b_flat_distribution));
+
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
             b_warp_tensor;
 
-        statically_indexed_array<
-            statically_indexed_array<decltype(load_tile(b_flat_dram_window)), KIterPerWarp>,
-            NIterPerWarp>
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
             b_warp_tensor_2;
 
         static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
@@ -313,7 +329,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
                 move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                  {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    b_warp_tensor(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
             });
         });
 
@@ -361,7 +378,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_2(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -394,7 +412,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -431,7 +450,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_2(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -455,7 +475,33 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
         return c_block_tile;
     }
 
-    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp>
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   [[maybe_unused]] const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   [[maybe_unused]] const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(
+            a_dram_block_window_tmp[number<0>{}],
+            [](const ADataType & a) { return a; },
+            b_flat_dram_block_window_tmp[number<0>{}],
+            num_loop,
+            p_smem);
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr>
     CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
                                    const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
                                    index_t num_loop,
@@ -463,7 +509,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
     {
         return operator()(
             a_dram_block_window_tmp,
-            [](const ADataType & a) { return a; },
+            [](auto& e, const ADataType & a) { e = a; },
             b_flat_dram_block_window_tmp,
             num_loop,
             p_smem);
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
index c507d8d8d8..670f4b0575 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/host/concat.hpp"
 #include "ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp"
 
@@ -24,7 +25,7 @@ struct BaseWeightPreshufflePipelineAGmemBGmemCRegV2
         return num_loop > PrefetchStages;
     }
 
-    CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
+    CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
     {
         return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
     }
@@ -35,12 +36,15 @@ struct BaseWeightPreshufflePipelineAGmemBGmemCRegV2
     {
         if(tail_number == TailNumber::Odd)
         {
-            run_func(bool_constant<true>{}, integral_constant<TailNumber, TailNumber::Odd>{});
+            return run_func(bool_constant<true>{},
+                            integral_constant<TailNumber, TailNumber::Odd>{});
         }
-        else if(tail_number == TailNumber::Even)
+        else // Even tail number
         {
-            run_func(bool_constant<true>{}, integral_constant<TailNumber, TailNumber::Even>{});
+            return run_func(bool_constant<true>{},
+                            integral_constant<TailNumber, TailNumber::Even>{});
         }
+        return run_func(bool_constant<true>{}, integral_constant<TailNumber, TailNumber::Empty>{});
     }
 };
 
@@ -50,14 +54,23 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 {
     using Base = BaseWeightPreshufflePipelineAGmemBGmemCRegV2<Problem>;
 
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
+    using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
+    using CDataType  = remove_cvref_t<typename Problem::CDataType>;
+
+    using AElementWise   = remove_cvref_t<typename Problem::AElementWise>;
+    using BElementWise   = remove_cvref_t<typename Problem::BElementWise>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>; // TileFlatmmShape
 
-    using ALayout = remove_cvref_t<typename Problem::ALayout>;
-    using BLayout = remove_cvref_t<typename Problem::BLayout>;
-    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+    using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
+    using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
+    using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
+
+    using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
+    using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
 
     using BlockWeightPreshuffle =
         remove_cvref_t<decltype(PipelinePolicy::template GetBlockWeightPreshuffle<Problem>())>;
@@ -67,12 +80,21 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
     using WG = remove_cvref_t<decltype(config.template at<0>())>;
 
+    static constexpr index_t DsWritePreIssue = 3; // default 2, ds write at MIter - 2
+    static constexpr index_t DsReadPreload   = 2; // default 2, preload 2 ds read
+
     static constexpr index_t BlockSize = Problem::kBlockSize;
+    static constexpr index_t WaveSize  = get_warp_size();
 
     static constexpr index_t kMPerBlock = BlockGemmShape::kM;
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
+    // bogus variables to compile grouped gemm (to be removed)
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
     static constexpr index_t flatKPerWarp = BlockGemmShape::flatKPerWarp;
     static constexpr index_t flatNPerWarp = BlockGemmShape::flatNPerWarp;
 
@@ -87,6 +109,11 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
         return PipelinePolicy::template GetVectorSizeB<Problem, IsWave32Host>();
     }
 
+    static constexpr index_t GetVectorSizeC()
+    {
+        return PipelinePolicy::template GetVectorSizeC<Problem>();
+    }
+
     static constexpr bool kPadM = Problem::kPadM;
     static constexpr bool kPadN = Problem::kPadN;
     static constexpr bool kPadK = Problem::kPadK;
@@ -117,13 +144,37 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
     static constexpr index_t MPerBlockPerIter = kMPerBlock / MIterPerWarp;
     static constexpr index_t KPerBlockPerIter = kKPerBlock / KIterPerWarp;
 
-    static constexpr index_t K1           = Problem::VectorLoadSize / sizeof(ADataType);
-    static constexpr index_t ACopyLoadNum = kMPerBlock * kKPerBlock / BlockSize / K1;
-    static constexpr auto TailNum         = Problem::TailNum;
+    static constexpr index_t K1        = Problem::VectorLoadSize / sizeof(ADataType);
+    static constexpr index_t m_preload = (MIterPerWarp * KIterPerWarp >= DsReadPreload)
+                                             ? DsReadPreload
+                                             : MIterPerWarp * KIterPerWarp;
+    static constexpr auto TailNum      = Problem::TailNum;
 
-    static constexpr auto warp_m = WarpTile::at(idxM);
-    static constexpr auto warp_n = WarpTile::at(idxN);
-    static constexpr auto warp_k = WarpTile::at(idxK);
+#ifdef __gfx942__
+    static constexpr index_t mfma_per_wg = 2;
+#else
+    static constexpr index_t mfma_per_wg = 1;
+#endif
+    static constexpr index_t dsread_per_wg =
+        max(index_t(WG::kM * WG::kK * sizeof(ADataType) / WaveSize / Problem::VectorLoadSize), 1);
+#if defined(__HIP_DEVICE_COMPILE__)
+    static_assert((WG::kM * WG::kK * sizeof(ADataType) * MIterPerWarp / WaveSize) %
+                      Problem::VectorLoadSize ==
+                  0);
+#endif
+    static constexpr index_t dsread_num_perK =
+        WG::kM * WG::kK * sizeof(ADataType) * MIterPerWarp / WaveSize / Problem::VectorLoadSize;
+    static constexpr index_t dswrite_num_perK = dsread_num_perK / (MWarp * NWarp);
+    static constexpr index_t dswrite_rep    = (dswrite_num_perK + MIterPerWarp - 1) / MIterPerWarp;
+    static constexpr index_t Aload_num_perK = dswrite_num_perK;
+    static constexpr index_t Aload_rep      = dswrite_rep;
+    static constexpr index_t Bload_num_perK = kNPerBlock * WG::kK / NWarp / K1 / WaveSize;
+    static constexpr index_t HalfMIter      = (MIterPerWarp + 1) / 2;
+    static constexpr index_t Bload_rep      = (Bload_num_perK + HalfMIter - 1) / HalfMIter;
+
+    static constexpr index_t mfma_perM_perK = NIterPerWarp * mfma_per_wg;
+    static constexpr index_t dswrite_mIter  = (DsWritePreIssue - 1) % MIterPerWarp;
+    static constexpr index_t dswrite_kIter  = (DsWritePreIssue - 1) / MIterPerWarp;
 
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
@@ -148,420 +199,330 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
         return PipelinePolicy::template GetSmemSize<Problem>();
     }
 
-    CK_TILE_HOST_DEVICE static constexpr auto HotLoopScheduler()
+    // dsread_perM: how many LDS reads want to issue in this M-iter
+    // dswrite_perM: how many LDS writes you want to do this M-iter
+    // load_perM: how many global loads VMEM want to do in this M-iter
+    CK_TILE_HOST_DEVICE static constexpr auto
+    SchedulerPerM(index_t dsread_perM, index_t dswrite_perM, index_t load_perM)
     {
 
-        constexpr index_t KPerLoad               = Problem::VectorLoadSize / sizeof(ADataType);
-        constexpr index_t A_Buffer_Load_Inst_Num = kMPerBlock * kKPerBlock / BlockSize / KPerLoad;
-        constexpr index_t A_LDS_Read_Inst_Num    = MIterPerWarp * KIterPerWarp;
-        constexpr index_t B_Buffer_Load_Inst_Num = NIterPerWarp * KIterPerWarp;
+        // Init inst order
+        index_t max_data_inst   = dsread_perM > load_perM
+                                      ? (dsread_perM > dswrite_perM ? dsread_perM : dswrite_perM)
+                                      : (load_perM > dswrite_perM ? load_perM : dswrite_perM);
+        index_t sum_data_inst   = dsread_perM + load_perM + dswrite_perM;
+        index_t round_data_inst = ck_tile::integer_divide_ceil(sum_data_inst, mfma_perM_perK);
 
-        // Keypoint of pipeline optimize is workload balance in time
-        // instruction schedule example(128X256X256, 1X4, 16X16X128):
-        // Iter MNK     MFMA    ds_read ds_write    A_load  b_load
-        // -1   M6N3:   60      2       -           -       -
-        // -1   M7N0:   61      -       -           -       -
-        // -1   M7N1:   62      -       -           -       -
-        // -1   M7N2:   63      -       -           -       -
-        // -1   M7N3:   64      4       -           -       -
-        //  0   M0N0K0:  1      -       -           -       -
-        //  0   M0N1:    2      -       -           -       2
-        //  0   M0N2:    3      -       -           -       -
-        //  0   M0N3:    4      6       -           -       -
-        //  0   M1N0:    5      -       -           -       -
-        //  0   M1N1:    6      -       -           -       4
-        //  0   M1N2:    7      -       -           -       -
-        //  0   M1N3:    8      8       -           -       -
-        //  0   M2N0:    9      -       -           -       -
-        //  0   M2N1:   10      -       -           -       6
-        //  0   M2N2:   11      -       -           -       -
-        //  0   M2N3:   12     10       -           -       -
-        //  0   M3N0:   13      -       1           -       -
-        //  0   M3N1:   14      -       -           -       8
-        //  0   M3N2:   15      -       -           -       -
-        //  0   M3N3:   16     12       -           -       -
-        //  0   M4N0:   17      -       2           -       -
-        //  0   M4N1:   18      -       -           -       -
-        //  0   M4N2:   19      -       -           1       -
-        //  0   M4N3:   20     14       -           -       -
-        //  0   M5N0:   21      -       3           -       -
-        //  0   M5N1:   22      -       -           -       -
-        //  0   M5N2:   23      -       -           2       -
-        //  0   M5N3:   24     16       -           -       -
-        //  0   M6N0:   25      -       4           -       -
-        //  0   M6N1:   26      -       -           -       -
-        //  0   M6N2:   27      -       -           3       -
-        //  0   M6N3:   28     17       -           -       -
-        //  0   M7N0:   29      -       -           -       -
-        //  0   M7N1:   30      -       -           -       -
-        //  0   M7N2:   31      -       -           4       -
-        //  0   M7N3:   32     18       -           -       -
-        //  0   M0N0K1: 33      -       -           -       -
-        //  0   M0N1:   34      -       -           -       10
-        //  0   M0N2:   35      -       -           -       -
-        //  0   M0N3:   36     20       -           -       -
-        //  0   M1N0:   37      -       -           -       -
-        //  0   M1N1:   38      -       -           -       12
-        //  0   M1N2:   39      -       -           -       -
-        //  0   M1N3:   40     22       -           -       -
-        //  0   M2N0:   41      -       -           -       -
-        //  0   M2N1:   42      -       -           -       14
-        //  0   M2N2:   43      -       -           -       -
-        //  0   M2N3:   44     24       -           -       -
-        //  0   M3N0:   45      -       5           -       -
-        //  0   M3N1:   46      -       -           -       16
-        //  0   M3N2:   47      -       -           -       -
-        //  0   M3N3:   48     26       -           -       -
-        //  0   M4N0:   49      -       6           -       -
-        //  0   M4N1:   50      -       -           -       -
-        //  0   M4N2:   51      -       -           5       -
-        //  0   M4N3:   52     28       -           -       -
-        //  0   M5N0:   53      -       7           -       -
-        //  0   M5N1:   54      -       -           -       -
-        //  0   M5N2:   55      -       -           6       -
-        //  0   M5N3:   56     30       -           -       -
-        //  0   M6N0:   57      -       8           -       -
-        //  0   M6N1:   58      -       -           -       -
-        //  0   M6N2:   59      -       -           7       -
-        //  0   M6N3:   60      2       -           -       -
-        //  0   M7N0:   61      -       -           -       -
-        //  0   M7N1:   62      -       -           -       -
-        //  0   M7N2:   63      -       -           8       -
-        //  0   M7N3:   64      4       -           -       -
-
-        if constexpr(warp_m == 16 && warp_n == 16)
+        constexpr int kOrderCap       = NIterPerWarp * 10;
+        index_t inst_order[kOrderCap] = {};
+        index_t index                 = 0;
+#pragma unroll
+        // round-robin
+        // Index:   0 1 2 3 4 5 ...
+        // Value:   1 2 3 1 2 3 ...
+        for(int j = 0; j < max_data_inst; j++)
         {
-// MFMA -> VMEM READ -> MFMA -> DS Read -> MFMA
-// hiding the glbal memory VMEM latency
-#if defined(__gfx950__)
-            if constexpr(kMPerBlock == 128 && kNPerBlock == 256 && kKPerBlock == 256)
+            if(dswrite_perM > j)
             {
-                static_for<0, 2, 1>{}([&](auto j) {
-                    ignore = j;
-                    static_for<0, 3, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-                    static_for<0, 3, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
+                inst_order[index] = 1;
+                index++;
             }
-            else
+            if(load_perM > j)
             {
-                static_for<0, 2, 1>{}([&](auto j) {
-                    ignore = j;
-                    static_for<0, 3, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-
-                    static_for<0, 3, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    });
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
+                inst_order[index] = 2;
+                index++;
             }
-// MFMA → MFMA → MFMA → MFMA → DS Read
-// For other device engine we need more aggressive MFMA with DS writes interleaved
-#else
-            if constexpr(kMPerBlock == 128 && kNPerBlock == 256 && kKPerBlock == 256)
+            if(dsread_perM > j)
             {
-                static_for<0, 2, 1>{}([&](auto j) {
-                    ignore = j;
-                    // Uses loops to amortize scheduling overhead
-                    static_for<0, 4, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
+                inst_order[index] = 3;
+                index++;
             }
-            else if constexpr(kMPerBlock == 16 && kNPerBlock == 64 && kKPerBlock == 256)
-            {
-                static_for<0, 1, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_barrier(0);
-            }
-            else if constexpr(kMPerBlock == 128 && kNPerBlock == 128 && kKPerBlock == 128)
-            {
-                // prioritize MFMA to avoid LDS write conflicts
-                static_for<0, 2, 1>{}([&](auto j) {
-                    ignore = j;
-                    static_for<0, 2, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 2, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                    static_for<0, 1, 1>{}([&](auto i) {
-                        ignore = i;
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    });
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
-            }
-            else
-            {
-                static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                });
-                static_for<0, A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
-                });
-                static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                });
-                static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                    __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
-                });
-            }
-
-#endif
         }
-        else
+
+// Schedule IGLP
+#pragma unroll
+        for(int j = 0; j < mfma_perM_perK; j++)
         {
-            if constexpr((A_LDS_Read_Inst_Num / 2 >
-                          A_Buffer_Load_Inst_Num + B_Buffer_Load_Inst_Num))
+            index_t inst_idx = 0;
+            if(j == 0)
+                ;
+            else if(j == 1)
+                inst_idx = mfma_perM_perK == 2 ? 1 : mfma_perM_perK - 2;
+            else if(j == 2)
+                inst_idx = mfma_perM_perK - 1;
+            else
+                inst_idx = mfma_perM_perK - j;
+
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+
+#pragma unroll
+            for(int r = 0; r < round_data_inst; r++)
             {
-                static_for<0,
-                           A_LDS_Read_Inst_Num / 2 - A_Buffer_Load_Inst_Num -
-                               B_Buffer_Load_Inst_Num,
-                           1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                });
+                if(r % 2 == 0)
+                {
+                    if(inst_order[inst_idx + r * mfma_perM_perK] == 1)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                    }
+                    if(inst_order[inst_idx + r * mfma_perM_perK] == 2)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                    if(inst_order[inst_idx + r * mfma_perM_perK] == 3)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                }
+                else
+                {
+                    if(inst_order[(r + 1) * mfma_perM_perK - 1 - inst_idx] == 1)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                    }
+                    if(inst_order[(r + 1) * mfma_perM_perK - 1 - inst_idx] == 2)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                    if(inst_order[(r + 1) * mfma_perM_perK - 1 - inst_idx] == 3)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                }
             }
-            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            });
-            static_for<0, A_LDS_Read_Inst_Num / 2, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            });
-            static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            });
-            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-                __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
-            });
-            __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
         }
     }
 
-    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const AElementFunction& a_element_func,
-                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem_ping,
-                                        void* p_smem_pong) const
+    CK_TILE_HOST_DEVICE static constexpr auto HotLoopScheduler()
+    {
+        // Keypoint of pipeline optimize is workload balance in time
+        // instruction schedule example(128X256X256, 1X4, 16X16X128):
+        // Iter MNK     MFMA    ds_read ds_write    A_load  b_load
+        // -1   M6N0:   57      -       8           -       -
+        // -1   M6N1:   58      1       -           -       -
+        // -1   M6N2:   59      -       -           7       -
+        // -1   M6N3:   60      2       -           -       -
+        // -1   M7N0:   61      -       -           -       -
+        // -1   M7N1:   62      3       -           -       -
+        // -1   M7N2:   63      -       -           8       -
+        // -1   M7N3:   64      4       -           -       -
+        //  0   M0N0K0:  1      -       -           -       1
+        //  0   M0N1:    2      5       -           -       -
+        //  0   M0N2:    3      -       -           -       2
+        //  0   M0N3:    4      6       -           -       -
+        //  0   M1N0:    5      -       -           -       3
+        //  0   M1N1:    6      7       -           -       -
+        //  0   M1N2:    7      -       -           -       4
+        //  0   M1N3:    8      8       -           -       -
+        //  0   M2N0:    9      -       -           -       5
+        //  0   M2N1:   10      9       -           -       -
+        //  0   M2N2:   11      -       -           -       6
+        //  0   M2N3:   12     10       -           -       -
+        //  0   M3N0:   13      -       1           -       7
+        //  0   M3N1:   14     11       -           -       -
+        //  0   M3N2:   15      -       -           -       8
+        //  0   M3N3:   16     12       -           -       -
+        //  0   M4N0:   17      -       2           -       -
+        //  0   M4N1:   18     13       -           -       -
+        //  0   M4N2:   19      -       -           1       -
+        //  0   M4N3:   20     14       -           -       -
+        //  0   M5N0:   21      -       3           -       -
+        //  0   M5N1:   22     15       -           -       -
+        //  0   M5N2:   23      -       -           2       -
+        //  0   M5N3:   24     16       -           -       -
+        //  0   M6N0:   25      -       4           -       -
+        //  0   M6N1:   26     17       -           -       -
+        //  0   M6N2:   27      -       -           3       -
+        //  0   M6N3:   28     18       -           -       -
+        //  0   M7N0:   29      -       -           -       -
+        //  0   M7N1:   30     19       -           -       -
+        //  0   M7N2:   31      -       -           4       -
+        //  0   M7N3:   32     20       -           -       -
+        //  0   M0N0K1: 33      -       -           -       9
+        //  0   M0N1:   34     21       -           -       -
+        //  0   M0N2:   35      -       -           -       10
+        //  0   M0N3:   36     22       -           -       -
+        //  0   M1N0:   37      -       -           -       11
+        //  0   M1N1:   38     23       -           -       -
+        //  0   M1N2:   39      -       -           -       12
+        //  0   M1N3:   40     24       -           -       -
+        //  0   M2N0:   41      -       -           -       13
+        //  0   M2N1:   42     25       -           -       -
+        //  0   M2N2:   43      -       -           -       14
+        //  0   M2N3:   44     26       -           -       -
+        //  0   M3N0:   45      -       5           -       15
+        //  0   M3N1:   46     27       -           -       -
+        //  0   M3N2:   47      -       -           -       16
+        //  0   M3N3:   48     28       -           -       -
+        //  0   M4N0:   49      -       6           -       -
+        //  0   M4N1:   50     29       -           -       -
+        //  0   M4N2:   51      -       -           5       -
+        //  0   M4N3:   52     30       -           -       -
+        //  0   M5N0:   53      -       7           -       -
+        //  0   M5N1:   54     31       -           -       -
+        //  0   M5N2:   55      -       -           6       -
+        //  0   M5N3:   56     32       -           -       -
+        //  0   M6N0:   57      -       8           -       -
+        //  0   M6N1:   58      1       -           -       -
+        //  0   M6N2:   59      -       -           7       -
+        //  0   M6N3:   60      2       -           -       -
+        //  0   M7N0:   61      -       -           -       -
+        //  0   M7N1:   62      3       -           -       -
+        //  0   M7N2:   63      -       -           8       -
+        //  0   M7N3:   64      4       -           -       -
+
+#pragma unroll
+        for(int kIter = 0; kIter < KIterPerWarp; kIter++)
+        {
+#pragma unroll
+            for(int mIter = 0; mIter < MIterPerWarp; mIter++)
+            {
+                index_t dsread_perM  = 0;
+                index_t dswrite_perM = 0;
+                index_t load_perM    = 0;
+
+                // Calculate ds_read number per M
+                dsread_perM = dsread_per_wg;
+
+                // Calculate ds_write number per M
+                if(mIter == 0)
+                {
+                    dswrite_perM =
+                        (dswrite_num_perK - (MIterPerWarp - DsWritePreIssue) * dswrite_rep) > 0
+                            ? dswrite_num_perK - (MIterPerWarp - DsWritePreIssue) * dswrite_rep
+                            : 0;
+                }
+                else if(mIter >= MIterPerWarp - DsWritePreIssue + 1)
+                {
+                    dswrite_perM = 0;
+                }
+                else
+                {
+                    dswrite_perM = (dswrite_num_perK -
+                                    (MIterPerWarp - DsWritePreIssue - mIter) * dswrite_rep) > 0
+                                       ? dswrite_rep
+                                       : 0;
+                }
+                // Add ds write when ds write data > needed
+                if(dswrite_num_perK == 0 && kIter == (KIterPerWarp - 1 - dswrite_kIter))
+                {
+                    if(mIter == MIterPerWarp - 1 - dswrite_mIter)
+                        dswrite_perM = 1;
+                }
+
+                // Calculate buffer_load number per M
+                if(mIter < HalfMIter)
+                {
+                    load_perM =
+                        ((Aload_num_perK - (MIterPerWarp - 1 - mIter) * Aload_rep) > 0 ? Aload_rep
+                                                                                       : 0) +
+                        ((Bload_num_perK - (HalfMIter - 1 - mIter) * Bload_rep) > 0 ? Bload_rep
+                                                                                    : 0);
+                }
+                else
+                {
+                    load_perM = (Aload_num_perK - (MIterPerWarp - 1 - mIter) * Aload_rep) > 0
+                                    ? Aload_rep
+                                    : 0;
+                }
+                SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
+            }
+        }
+        // Add Aload when Aload data > needed
+        if(Aload_num_perK == 0)
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto Last2ndHotLoopScheduler()
+    {
+#pragma unroll
+        for(int kIter = 0; kIter < KIterPerWarp; kIter++)
+        {
+#pragma unroll
+            for(int mIter = 0; mIter < MIterPerWarp; mIter++)
+            {
+                index_t dsread_perM  = 0;
+                index_t dswrite_perM = 0;
+                index_t load_perM    = 0;
+
+                // Calculate ds_read number per M
+                dsread_perM = dsread_per_wg;
+
+                // Calculate ds_write number per M
+                if(mIter == 0)
+                {
+                    dswrite_perM =
+                        (dswrite_num_perK - (MIterPerWarp - DsWritePreIssue) * dswrite_rep) > 0
+                            ? dswrite_num_perK - (MIterPerWarp - DsWritePreIssue) * dswrite_rep
+                            : 0;
+                }
+                else if(mIter >= MIterPerWarp - DsWritePreIssue + 1)
+                {
+                    dswrite_perM = 0;
+                }
+                else
+                {
+                    dswrite_perM = (dswrite_num_perK -
+                                    (MIterPerWarp - DsWritePreIssue - mIter) * dswrite_rep) > 0
+                                       ? dswrite_rep
+                                       : 0;
+                }
+                // Add ds write when ds write data > needed
+                if(dswrite_num_perK == 0 && kIter == (KIterPerWarp - 1 - dswrite_kIter))
+                {
+                    if(mIter == MIterPerWarp - 1 - dswrite_mIter)
+                        dswrite_perM = 1;
+                }
+
+                // Calculate buffer_load number per M
+                if(mIter < HalfMIter)
+                {
+                    load_perM =
+                        ((Bload_num_perK - (HalfMIter - 1 - mIter) * Bload_rep) > 0 ? Bload_rep
+                                                                                    : 0);
+                }
+                SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
+            }
+        }
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto LastHotLoopScheduler()
+    {
+#pragma unroll
+        for(int kIter = 0; kIter < KIterPerWarp; kIter++)
+        {
+#pragma unroll
+            for(int mIter = 0; mIter < MIterPerWarp; mIter++)
+            {
+                index_t dsread_perM  = 0;
+                index_t dswrite_perM = 0;
+                index_t load_perM    = 0;
+
+                // Calculate ds_read number per M
+                if((kIter * MIterPerWarp + mIter) < (KIterPerWarp * MIterPerWarp - m_preload))
+                    dsread_perM = dsread_per_wg;
+
+                SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
+            }
+        }
+        // __builtin_amdgcn_sched_barrier(0);
+    }
+
+    template <TailNumber TailNum,
+              typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename AElementFunction,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr,
+              index_t UnaryOpSize_             = 8>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem_ping,
+                                   void* p_smem_pong) const
     {
         static_assert(
             std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>>,
@@ -672,19 +633,19 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                 b_flat_distribution);
 
         // pingpong buffer for B
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+        using BTileType = decltype(make_static_distributed_tensor<BTypeToUse>(b_flat_distribution));
+
         statically_indexed_array<
             statically_indexed_array<decltype(b_flat_dram_window), KIterPerWarp>,
             NIterPerWarp>
             b_flat_dram_windows;
 
-        statically_indexed_array<
-            statically_indexed_array<decltype(load_tile(b_flat_dram_window)), KIterPerWarp>,
-            NIterPerWarp>
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
             b_warp_tensor_ping;
 
-        statically_indexed_array<
-            statically_indexed_array<decltype(load_tile(b_flat_dram_window)), KIterPerWarp>,
-            NIterPerWarp>
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
             b_warp_tensor_pong;
 
         // Prefetch A0
@@ -700,7 +661,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                 move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                  {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                b_warp_tensor_ping(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
             });
         });
         // move B window to next flat K
@@ -723,22 +685,19 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
         block_sync_lds();
 
         // preload A00,A10 from lds
-        constexpr auto m_preload = (MIterPerWarp * KIterPerWarp >= 2) ? 2 : 1;
         statically_indexed_array<decltype(load_tile(a_warp_windows_ping(number<0>{})(number<0>{}))),
                                  m_preload>
-            a_warp_tensor_ping;
-        statically_indexed_array<decltype(load_tile(a_warp_windows_pong(number<0>{})(number<0>{}))),
-                                 m_preload>
-            a_warp_tensor_pong;
+            a_warp_tensor;
 
         static_for<0, m_preload, 1>{}([&](auto loadIter) {
             constexpr auto mIter = loadIter % MIterPerWarp;
             constexpr auto kIter = loadIter / MIterPerWarp;
-            a_warp_tensor_ping(loadIter) =
+            a_warp_tensor(loadIter) =
                 load_tile(a_warp_windows_ping(number<mIter>{})(number<kIter>{}));
         });
         __builtin_amdgcn_sched_barrier(0);
 
+        // MAIN LOOP
         index_t iCounter = (num_loop - 1) / 2;
         while(iCounter > 0)
         {
@@ -750,7 +709,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor_pong(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -777,7 +737,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
                         // warp GEMM
                         WG{}(c_warp_tensor,
-                             a_warp_tensor_ping(number<AwarpIter>{}),
+                             a_warp_tensor(number<AwarpIter>{}),
                              b_warp_tensor_ping(nIter)(kIter));
 
                         // write C warp tensor into C block tensor
@@ -794,7 +754,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     {
                         constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                         constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                        a_warp_tensor_ping(number<AwarpIter>{}) =
+                        a_warp_tensor(number<AwarpIter>{}) =
                             load_tile(a_warp_windows_ping(number<AmIter>{})(number<AkIter>{}));
                     }
 
@@ -811,7 +771,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
             static_for<0, m_preload, 1>{}([&](auto loadIter) {
                 constexpr auto mIter = loadIter % MIterPerWarp;
                 constexpr auto kIter = loadIter / MIterPerWarp;
-                a_warp_tensor_pong(loadIter) =
+                a_warp_tensor(loadIter) =
                     load_tile(a_warp_windows_pong(number<mIter>{})(number<kIter>{}));
             });
             HotLoopScheduler();
@@ -826,7 +786,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor_ping(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -852,7 +813,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
                         // warp GEMM
                         WG{}(c_warp_tensor,
-                             a_warp_tensor_pong(number<AwarpIter>{}),
+                             a_warp_tensor(number<AwarpIter>{}),
                              b_warp_tensor_pong(nIter)(kIter));
 
                         // write C warp tensor into C block tensor
@@ -869,7 +830,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     {
                         constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                         constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                        a_warp_tensor_pong(number<AwarpIter>{}) =
+                        a_warp_tensor(number<AwarpIter>{}) =
                             load_tile(a_warp_windows_pong(number<AmIter>{})(number<AkIter>{}));
                     }
 
@@ -886,7 +847,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
             static_for<0, m_preload, 1>{}([&](auto loadIter) {
                 constexpr auto mIter = loadIter % MIterPerWarp;
                 constexpr auto kIter = loadIter / MIterPerWarp;
-                a_warp_tensor_ping(loadIter) =
+                a_warp_tensor(loadIter) =
                     load_tile(a_warp_windows_ping(number<mIter>{})(number<kIter>{}));
             });
             HotLoopScheduler();
@@ -906,7 +867,8 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                      {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
 
-                    b_warp_tensor_pong(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter));
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                 });
             });
 
@@ -928,7 +890,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
                         // warp GEMM
                         WG{}(c_warp_tensor,
-                             a_warp_tensor_ping(number<AwarpIter>{}),
+                             a_warp_tensor(number<AwarpIter>{}),
                              b_warp_tensor_ping(nIter)(kIter));
 
                         // write C warp tensor into C block tensor
@@ -945,7 +907,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     {
                         constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                         constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                        a_warp_tensor_ping(number<AwarpIter>{}) =
+                        a_warp_tensor(number<AwarpIter>{}) =
                             load_tile(a_warp_windows_ping(number<AmIter>{})(number<AkIter>{}));
                     }
 
@@ -961,11 +923,11 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
             static_for<0, m_preload, 1>{}([&](auto loadIter) {
                 constexpr auto mIter = loadIter % MIterPerWarp;
                 constexpr auto kIter = loadIter / MIterPerWarp;
-                a_warp_tensor_pong(loadIter) =
+                a_warp_tensor(loadIter) =
                     load_tile(a_warp_windows_pong(number<mIter>{})(number<kIter>{}));
             });
 
-            // __builtin_amdgcn_sched_barrier(0);
+            Last2ndHotLoopScheduler();
 
             // GEMM loopK
             static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
@@ -981,7 +943,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
                         // warp GEMM
                         WG{}(c_warp_tensor,
-                             a_warp_tensor_pong(number<AwarpIter>{}),
+                             a_warp_tensor(number<AwarpIter>{}),
                              b_warp_tensor_pong(nIter)(kIter));
 
                         // write C warp tensor into C block tensor
@@ -989,19 +951,23 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                             merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
                             merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
                             c_warp_tensor.get_thread_buffer());
-                        __builtin_amdgcn_sched_barrier(0x7F6);
                     });
                     if constexpr((kIter * MIterPerWarp + mIter) <
                                  (KIterPerWarp * MIterPerWarp - m_preload))
                     {
                         constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                         constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                        a_warp_tensor_pong(number<AwarpIter>{}) =
+                        a_warp_tensor(number<AwarpIter>{}) =
                             load_tile(a_warp_windows_pong(number<AmIter>{})(number<AkIter>{}));
                     }
+                    // barrier
+                    if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last))
+                    {
+                        block_sync_lds();
+                    }
                 });
             });
-            // TailHotLoopScheduler();
+            LastHotLoopScheduler();
         }
         else if constexpr(TailNum == TailNumber::Odd)
         {
@@ -1019,7 +985,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
 
                         // warp GEMM
                         WG{}(c_warp_tensor,
-                             a_warp_tensor_ping(number<AwarpIter>{}),
+                             a_warp_tensor(number<AwarpIter>{}),
                              b_warp_tensor_ping(nIter)(kIter));
 
                         // write C warp tensor into C block tensor
@@ -1036,7 +1002,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     {
                         constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                         constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                        a_warp_tensor_ping(number<AwarpIter>{}) =
+                        a_warp_tensor(number<AwarpIter>{}) =
                             load_tile(a_warp_windows_ping(number<AmIter>{})(number<AkIter>{}));
                     }
 
@@ -1047,26 +1013,84 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
                     }
                 });
             });
+            LastHotLoopScheduler();
         }
 
         return c_block_tile;
     }
 
-    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp>
+    // called from universal gemm kernel
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction,
+              typename std::enable_if_t<is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   [[maybe_unused]] const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   [[maybe_unused]] const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem_ping,
+                                   void* p_smem_pong) const
+    {
+        return operator()<TailNum>(
+            a_dram_block_window_tmp[number<0>{}],
+            [](const ADataType& a) { return a; },
+            b_flat_dram_block_window_tmp[number<0>{}],
+            num_loop,
+            p_smem_ping,
+            p_smem_pong);
+    }
+
+    // called from general gemm kernel
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr>
     CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
                                    const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
                                    index_t num_loop,
                                    void* p_smem_ping,
                                    void* p_smem_pong) const
     {
-        return operator()(
+        return operator()<TailNum>(
             a_dram_block_window_tmp,
-            [](const ADataType & a) { return a; },
+            [](const ADataType& a) { return a; },
             b_flat_dram_block_window_tmp,
             num_loop,
             p_smem_ping,
             p_smem_pong);
     }
+
+    // called from grouped gemm kernel
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename std::enable_if_t<!is_detected<is_tuple, ADramBlockWindowTmp>::value &&
+                                            !is_detected<is_tuple, BFlatBlockWindowTmp>::value,
+                                        bool>* = nullptr>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   TailNumber tail_number,
+                                   void* __restrict__ p_smem_0,
+                                   void* __restrict__ p_smem_1) const
+    {
+        const auto RunPipeline = [&](auto bool_val, auto tail_num_) {
+            (void)bool_val; // Suppress unused parameter warning
+            constexpr auto tail_num    = tail_num_.value;
+            constexpr auto PassThrough = [](const auto& x) { return x; };
+            return operator()<tail_num>(a_dram_block_window_tmp,
+                                        PassThrough,
+                                        b_flat_dram_block_window_tmp,
+                                        num_loop,
+                                        p_smem_0,
+                                        p_smem_1);
+        };
+        return Base::TailHandler(RunPipeline, true, tail_number);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index 87772f78fc..21f21e1aa0 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -12,6 +12,24 @@
 
 namespace ck_tile {
 
+// fp32
+
+using WarpGemmMfmaF32F32F32M16N16K4 = WarpGemmImpl<
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>>>;
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfmaF32F32F32M16N16K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>,
+    4,
+    AttrNumAccess>>;
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>,
+        4,
+        AttrNumAccess>>;
+
 // fp16
 
 using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
@@ -301,6 +319,30 @@ using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl<
     WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>;
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>;
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>;
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>;
+
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl<
     WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index 1f8b4f8adc..d66438528e 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -125,6 +125,7 @@ struct WarpGemmAttributeMfmaIterateK
     static constexpr index_t kN          = Impl::kN;
     static constexpr index_t kK          = Impl::kK * kKIter;
     static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+    static constexpr index_t kCMLane     = Impl::kCMLane;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index 0831cf85c4..7528760439 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -61,6 +61,135 @@ enum class WGAttrCtlEnum
         DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a")     \
     }
 
+// F32
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF32F32F32M16N16K4
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+
+    using ADataType = float;
+    using BDataType = float;
+    using CDataType = float;
+
+    using AVecType = ext_vector_t<ADataType, 1>;
+    using BVecType = ext_vector_t<BDataType, 1>;
+    using CVecType = ext_vector_t<CDataType, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 4;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 1;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x4f32", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x4f32(a_vec[0], b_vec[0], c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_16x16x4f32(a_vec[0], b_vec[0], CVecType{0.f}, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF32F32F32M32N32K2
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+
+    using ADataType = float;
+    using BDataType = float;
+    using CDataType = float;
+
+    using AVecType = ext_vector_t<ADataType, 1>;
+    using BVecType = ext_vector_t<BDataType, 1>;
+    using CVecType = ext_vector_t<CDataType, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 2;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 1;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x2f32", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_vec[0], b_vec[0], c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_32x32x2f32(a_vec[0], b_vec[0], CVecType{0.f}, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 // V_MFMA_F32_16x16x32_BF16
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32
@@ -660,8 +789,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4
         DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl)
         else
         {
-#if defined(__gfx9__)
+#if defined(__gfx90a__) || defined(__gfx94__)
             c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+#elif defined(__gfx908__)
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_4x4x2bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
             ignore = c_vec;
             ignore = a_vec;
@@ -673,9 +814,23 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-#if defined(__gfx9__)
+#if defined(__gfx90a__) || defined(__gfx94__)
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#elif defined(__gfx908__)
+        CVecType c_vec{0.f};
+        static_for<0, 2, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x2bf16(
+                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
 #else
         ignore = a_vec;
         ignore = b_vec;
@@ -724,8 +879,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
         DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl)
         else
         {
-#if defined(__gfx9__)
+#if defined(__gfx90a__) || defined(__gfx94__)
             c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+#elif defined(__gfx908__)
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_4x4x2bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
             ignore = c_vec;
             ignore = a_vec;
@@ -737,9 +904,23 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-#if defined(__gfx9__)
+#if defined(__gfx90a__) || defined(__gfx94__)
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#elif defined(__gfx908__)
+        CVecType c_vec{0.f};
+        static_for<0, 2, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x2bf16(
+                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
 #else
         ignore = a_vec;
         ignore = b_vec;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp
index 7a3190e6f4..86bae7655b 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp
@@ -70,9 +70,9 @@ struct WmmaTraitsBase<gfx12_t, ADType, BDType, CDType>
     static constexpr index_t kRepeat      = 1;
     static constexpr index_t kAMLane      = 16;
     static constexpr index_t kBNLane      = 16;
-    static constexpr index_t kABK0PerLane = 2;
+    static constexpr index_t kABK0PerLane = 1;
     static constexpr index_t kABKLane     = 2;
-    static constexpr index_t kABK1PerLane = 4;
+    static constexpr index_t kABK1PerLane = 8;
 
     static constexpr index_t kCMLane     = 2;
     static constexpr index_t kCNLane     = 16;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 5021fb9907..924f7c4a54 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -23,6 +23,11 @@ template <typename AType,
 struct WarpGemmDispatcher;
 
 // clang-format off
+// fp32
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
+template<> struct WarpGemmDispatcher<float, float, float, 16, 16,  4, false> { using Type = WarpGemmMfmaF32F32F32M16N16K4; };
+template<> struct WarpGemmDispatcher<float, float, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF32F32F32M16N16K16<>; };
+template<> struct WarpGemmDispatcher<float, float, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution<>; };
 // fp16
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
 template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
@@ -112,6 +117,10 @@ template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16,
 template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8<>; };
 template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8<>; };
 template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, true> { using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, true> { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, true> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, true> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed<>; };
 
 template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8<>; };
 template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8<>; };
@@ -134,6 +143,7 @@ template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16,
     using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8<WGAttrNumAccessEnum::Quad>; };
 template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8<WGAttrNumAccessEnum::Quad>; };
+
 //WMMA cases
 template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_f8_f8<TransposeC>; };
 template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_bf8_bf8<TransposeC>; };
diff --git a/include/ck_tile/ops/gemm_group_quant.hpp b/include/ck_tile/ops/gemm_group_quant.hpp
deleted file mode 100644
index 92a53dd5ea..0000000000
--- a/include/ck_tile/ops/gemm_group_quant.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp"
-#include "ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp"
-#include "ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp"
-#include "ck_tile/ops/gemm_group_quant/kernel/gemm_bquant_kernel.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_quant_pipeline_problem.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_quant_traits.hpp"
-#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
-#include "ck_tile/ops/common/tensor_layout.hpp"
-#include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp b/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
deleted file mode 100644
index 69acb668ed..0000000000
--- a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
+++ /dev/null
@@ -1,753 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/core/algorithm/coordinate_transform.hpp"
-#include "ck_tile/core/arch/arch.hpp"
-#include "ck_tile/core/container/tuple.hpp"
-#include "ck_tile/core/numeric/integer.hpp"
-#include "ck_tile/core/numeric/math.hpp"
-#include "ck_tile/host/concat.hpp"
-
-namespace ck_tile {
-
-struct AQuantGemmProblem
-{
-    CK_TILE_HOST AQuantGemmProblem() = default;
-    CK_TILE_HOST AQuantGemmProblem(index_t M_,
-                                   index_t N_,
-                                   index_t K_,
-                                   index_t QK_,
-                                   index_t stride_A_,
-                                   index_t stride_B_,
-                                   index_t stride_C_,
-                                   index_t stride_AQ_)
-        : M(M_),
-          N(N_),
-          K(K_),
-          QK(QK_),
-          stride_A(stride_A_),
-          stride_B(stride_B_),
-          stride_C(stride_C_),
-          stride_AQ(stride_AQ_)
-    {
-    }
-
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t QK;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-    index_t stride_AQ;
-};
-
-struct AQuantGemmHostArgs : public AQuantGemmProblem
-{
-    CK_TILE_HOST AQuantGemmHostArgs() = default;
-    CK_TILE_HOST AQuantGemmHostArgs(const void* a_ptr_,
-                                    const void* b_ptr_,
-                                    void* c_ptr_,
-                                    const void* aq_ptr_,
-                                    index_t k_batch_,
-                                    index_t M_,
-                                    index_t N_,
-                                    index_t K_,
-                                    index_t QK_,
-                                    index_t stride_A_,
-                                    index_t stride_B_,
-                                    index_t stride_C_,
-                                    index_t stride_AQ_)
-        : AQuantGemmProblem(M_, N_, K_, QK_, stride_A_, stride_B_, stride_C_, stride_AQ_),
-          a_ptr(a_ptr_),
-          b_ptr(b_ptr_),
-          aq_ptr(aq_ptr_),
-          c_ptr(c_ptr_),
-          k_batch(k_batch_)
-    {
-    }
-
-    const void* a_ptr;
-    const void* b_ptr;
-    const void* aq_ptr;
-    void* c_ptr;
-    index_t k_batch;
-};
-
-struct AQuantGemmKernelArgs
-{
-    const void* a_ptr;
-    const void* b_ptr;
-    const void* aq_ptr;
-    void* c_ptr;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t QK;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-    index_t stride_AQ;
-    index_t k_batch;
-};
-
-template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
-struct AQuantGemmKernel
-{
-    using TilePartitioner                 = remove_cvref_t<TilePartitioner_>;
-    using GemmPipeline                    = remove_cvref_t<GemmPipeline_>;
-    using EpiloguePipeline                = remove_cvref_t<EpiloguePipeline_>;
-    using ALayout                         = remove_cvref_t<typename GemmPipeline::ALayout>;
-    using AQLayout                        = remove_cvref_t<typename GemmPipeline::AQLayout>;
-    using BLayout                         = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using CLayout                         = remove_cvref_t<typename GemmPipeline::CLayout>;
-    static constexpr index_t kBlockSize   = GemmPipeline::BlockSize;
-    static constexpr bool PreshuffleQuant = GemmPipeline::PreshuffleQuant;
-
-    using ADataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using AQDataType = remove_cvref_t<typename GemmPipeline::AQDataType>;
-    using BDataType  = remove_cvref_t<typename GemmPipeline::BDataType>;
-    using CDataType  = remove_cvref_t<typename EpiloguePipeline::ODataType>;
-
-    static constexpr auto I0 = number<0>();
-    static constexpr auto I1 = number<1>();
-    static constexpr auto I2 = number<2>();
-    static constexpr auto I3 = number<3>();
-
-    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
-    {
-        // clang-format off
-        return concat('_', "gemm", gemm_prec_str<ADataType, BDataType>, GemmPipeline::GetName());
-        // clang-format on
-    }
-
-    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
-    {
-        return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
-    }
-
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
-
-    CK_TILE_HOST static constexpr AQuantGemmKernelArgs
-    MakeKernelArgs(const AQuantGemmHostArgs& hostArgs)
-    {
-        return AQuantGemmKernelArgs{hostArgs.a_ptr,
-                                    hostArgs.b_ptr,
-                                    hostArgs.aq_ptr,
-                                    hostArgs.c_ptr,
-                                    hostArgs.M,
-                                    hostArgs.N,
-                                    hostArgs.K,
-                                    hostArgs.QK,
-                                    hostArgs.stride_A,
-                                    hostArgs.stride_B,
-                                    hostArgs.stride_C,
-                                    hostArgs.stride_AQ,
-                                    hostArgs.k_batch};
-    }
-
-    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
-    {
-        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
-    }
-
-    struct SplitKBatchOffset
-    {
-        __device__ SplitKBatchOffset(const AQuantGemmKernelArgs& kargs,
-                                     const std::size_t k_id = blockIdx.z)
-        {
-            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
-            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
-
-            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            }
-            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A);
-            }
-
-            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
-            {
-                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B);
-            }
-            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
-            {
-                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            }
-
-            if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
-            {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
-            }
-            else
-            {
-                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
-            }
-        }
-
-        index_t a_k_split_offset;
-        index_t b_k_split_offset;
-        index_t splitted_k;
-    };
-
-    CK_TILE_HOST static bool IsSupportedArgument(const AQuantGemmKernelArgs& kargs)
-    {
-        if(kargs.k_batch != 1)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
-            }
-            return false;
-        }
-
-        static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-        if(kargs.QK % GemmPipeline::GetVectorSizeAQ() != 0)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
-            }
-            return false;
-        }
-
-        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
-               GemmPipeline::kPadK == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
-                                  "without padding!");
-                }
-                return false;
-            }
-            if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support M that is not a multiple of MPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("M is not a multiple of vector load size for A tensor!");
-                }
-                return false;
-            }
-        }
-
-        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support N that is not a multiple of NPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("N is not a multiple of vector load size for B tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
-               GemmPipeline::kPadK == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
-                                  "without padding!");
-                }
-                return false;
-            }
-            if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("K is not a multiple of vector load size for B tensor!");
-                }
-                return false;
-            }
-        }
-
-        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support N that is not a multiple of NPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("N is not a multiple of vector load size for C tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support M that is not a multiple of MPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("M is not a multiple of vector load size for C tensor!");
-                }
-                return false;
-            }
-        }
-        return true;
-    }
-
-    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
-    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
-                                                   const BDataType* b_ptr,
-                                                   const AQDataType* aq_ptr,
-                                                   CDataType* c_ptr,
-                                                   const AQuantGemmKernelArgs& kargs,
-                                                   const SplitKBatchOffset& splitk_batch_offset)
-    {
-        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
-        const auto& a_tensor_view = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_ptr,
-                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::GetVectorSizeA()>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_ptr,
-                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::GetVectorSizeA()>{},
-                    number<1>{});
-            }
-        }();
-
-        const auto get_padding_size = [](index_t length, index_t alignment) {
-            return ck_tile::integer_least_multiple(length, alignment) - length;
-        };
-
-        const auto& make_preshuffled_aq_tensor_view = [&]() {
-            const auto aq_x = kargs.M * GemmPipeline::KPerBlockAQ;
-            const auto aq_y = kargs.QK / GemmPipeline::KPerBlockAQ;
-
-            const auto aq_desc =
-                make_naive_tensor_descriptor(make_tuple(aq_y, aq_x),
-                                             make_tuple(aq_x, 1),
-                                             number<GemmPipeline::GetVectorSizeAQ()>{},
-                                             number<1>{});
-
-            const auto block_tile_size = GemmPipeline::MPerBlock * GemmPipeline::KPerBlockAQ;
-            const auto aq_pad0_desc    = transform_tensor_descriptor(
-                aq_desc,
-                make_tuple(make_pass_through_transform(aq_y),
-                           make_right_pad_transform(aq_x, get_padding_size(aq_x, block_tile_size))),
-                make_tuple(sequence<0>{}, sequence<1>{}),
-                make_tuple(sequence<0>{}, sequence<1>{}));
-
-            const auto pad_aq_x = aq_pad0_desc.get_lengths()[I1];
-            const auto wave_tile_size =
-                TilePartitioner::BlockGemmShape::WarpTile::at(I0) * GemmPipeline::KPerBlockAQ;
-            const auto wave_tile_count_x = ck_tile::integer_divide_ceil(pad_aq_x, wave_tile_size);
-            const auto aq_unmerge_pad0_desc = transform_tensor_descriptor(
-                aq_pad0_desc,
-                make_tuple(make_pass_through_transform(aq_y),
-                           make_unmerge_transform(make_tuple(wave_tile_count_x, wave_tile_size))),
-                make_tuple(sequence<0>{}, sequence<1>{}),
-                make_tuple(sequence<0>{}, sequence<1, 2>{}));
-
-            const auto aq_pad1_desc = transform_tensor_descriptor(
-                aq_unmerge_pad0_desc,
-                make_tuple(make_pass_through_transform(aq_y),
-                           make_pass_through_transform(wave_tile_count_x),
-                           make_right_pad_transform(
-                               wave_tile_size, get_padding_size(wave_tile_size, get_warp_size()))),
-                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
-                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
-
-            const auto pad_wave_size =
-                ck_tile::integer_least_multiple(wave_tile_size, get_warp_size());
-            const auto aq_merge_pad1_desc = transform_tensor_descriptor(
-                aq_pad1_desc,
-                make_tuple(make_merge_transform(make_tuple(aq_y, wave_tile_count_x)),
-                           make_pass_through_transform(pad_wave_size)),
-                make_tuple(sequence<0, 1>{}, sequence<2>{}),
-                make_tuple(sequence<0>{}, sequence<1>{}));
-
-            return make_tensor_view<address_space_enum::global>(aq_ptr, aq_merge_pad1_desc);
-        };
-
-        const auto& aq_tensor_view = [&]() {
-            static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            if constexpr(PreshuffleQuant)
-            {
-                return make_preshuffled_aq_tensor_view();
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    aq_ptr,
-                    make_tuple(kargs.M, kargs.QK),
-                    make_tuple(kargs.stride_AQ, 1),
-                    number<GemmPipeline::GetVectorSizeAQ()>{},
-                    number<1>{});
-            }
-        }();
-
-        const auto& b_tensor_view = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
-            {
-                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
-                {
-                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
-                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
-                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
-                    const auto b_k0_n_k1_desc =
-                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
-                                                     make_tuple(kargs.N * K1, K1, I1),
-                                                     number<VectorSizeB>{},
-                                                     number<1>{});
-                    const auto b_n_k_desc = transform_tensor_descriptor(
-                        b_k0_n_k1_desc,
-                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                                   make_pass_through_transform(kargs.N)),
-                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
-                        make_tuple(sequence<0>{}, sequence<1>{}));
-                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
-                }
-                else
-                {
-                    return make_naive_tensor_view<address_space_enum::global>(
-                        b_ptr,
-                        make_tuple(splitk_batch_offset.splitted_k, kargs.N),
-                        make_tuple(kargs.stride_B, 1),
-                        number<GemmPipeline::GetVectorSizeB()>{},
-                        number<1>{});
-                }
-            }
-            else
-            {
-                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
-                {
-                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
-                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
-                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
-                    const auto b_k0_n_k1_desc =
-                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
-                                                     make_tuple(kargs.N * K1, K1, I1),
-                                                     number<VectorSizeB>{},
-                                                     number<1>{});
-                    const auto b_n_k_desc = transform_tensor_descriptor(
-                        b_k0_n_k1_desc,
-                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                                   make_pass_through_transform(kargs.N)),
-                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
-                        make_tuple(sequence<1>{}, sequence<0>{}));
-                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
-                }
-                else
-                {
-                    return make_naive_tensor_view<address_space_enum::global>(
-                        b_ptr,
-                        make_tuple(kargs.N, splitk_batch_offset.splitted_k),
-                        make_tuple(kargs.stride_B, 1),
-                        number<GemmPipeline::GetVectorSizeB()>{},
-                        number<1>{});
-                }
-            }
-        }();
-
-        // TODO: enable vector write for C in ColMajor
-        const auto& c_tensor_view = [&]() {
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
-                    c_ptr,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(kargs.stride_C, 1),
-                    number<EpiloguePipeline::GetVectorSizeC()>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
-                    c_ptr,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_C),
-                    number<1>{},
-                    number<1>{});
-            }
-        }();
-
-        return make_tuple(a_tensor_view, aq_tensor_view, b_tensor_view, c_tensor_view);
-    }
-
-    template <typename TensorView>
-    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
-    {
-        const auto& a_pad_view = [&]() {
-            const auto& a_tensor_view = views.at(I0);
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(a_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::KPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(a_tensor_view,
-                                       make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                  number<TilePartitioner::MPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadM>{});
-            }
-        }();
-
-        const auto& aq_pad_view = [&]() { return views.at(I1); }();
-
-        const auto& b_pad_view = [&]() {
-            const auto& b_tensor_view = views.at(I2);
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
-            {
-                return pad_tensor_view(b_tensor_view,
-                                       make_tuple(number<TilePartitioner::NPerBlock>{},
-                                                  number<TilePartitioner::KPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(b_tensor_view,
-                                       make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadN>{});
-            }
-        }();
-
-        // TODO vector write in for C in ColMajor
-        const auto& c_pad_view = [&]() {
-            const auto& c_tensor_view = views.at(I3);
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(c_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadN>{});
-            }
-            else
-            {
-                return pad_tensor_view(c_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<GemmPipeline::kPadM, false>{});
-            }
-        }();
-
-        return make_tuple(a_pad_view, aq_pad_view, b_pad_view, c_pad_view);
-    }
-
-    template <typename PadView>
-    CK_TILE_DEVICE static auto
-    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
-    {
-        const auto& a_pad_view  = views.at(I0);
-        const auto& aq_pad_view = views.at(I1);
-        const auto& b_pad_view  = views.at(I2);
-        const auto& c_pad_view  = views.at(I3);
-
-        const auto& a_block_window = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_tile_window(a_pad_view,
-                                        make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                   number<TilePartitioner::KPerBlock>{}),
-                                        {i_m, 0});
-            }
-            else
-            {
-                return make_tile_window(a_pad_view,
-                                        make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                   number<TilePartitioner::MPerBlock>{}),
-                                        {0, i_m});
-            }
-        }();
-
-        const auto& aq_block_window = [&]() {
-            static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            constexpr auto block_m = TilePartitioner::MPerBlock;
-            constexpr auto block_k = TilePartitioner::KPerBlock;
-            constexpr auto warp_m  = TilePartitioner::BlockGemmShape::WarpTile::at(I0);
-            constexpr auto aqk_per_block =
-                TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize;
-            if constexpr(PreshuffleQuant)
-            {
-                constexpr auto tile_window_width =
-                    ck_tile::integer_least_multiple(warp_m * aqk_per_block, get_warp_size());
-                constexpr auto tile_window_height = block_m / warp_m;
-                auto block_m_idx                  = i_m / block_m;
-                return make_tile_window(
-                    aq_pad_view,
-                    make_tuple(number<tile_window_height>{}, number<tile_window_width>{}),
-                    {block_m_idx * tile_window_height, 0});
-            }
-            else
-            {
-                return make_tile_window(
-                    aq_pad_view,
-                    make_tuple(number<block_m>{}, number<block_k / GemmPipeline::QuantGroupSize>{}),
-                    {i_m, 0});
-            }
-        }();
-
-        const auto& b_block_window = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
-            {
-                return make_tile_window(b_pad_view,
-                                        make_tuple(number<TilePartitioner::NPerBlock>{},
-                                                   number<TilePartitioner::KPerBlock>{}),
-                                        {i_n, 0});
-            }
-            else
-            {
-                return make_tile_window(b_pad_view,
-                                        make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                   number<TilePartitioner::NPerBlock>{}),
-                                        {0, i_n});
-            }
-        }();
-
-        auto c_block_window = make_tile_window(
-            c_pad_view,
-            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
-            {i_m, i_n});
-
-        return make_tuple(a_block_window, aq_block_window, b_block_window, c_block_window);
-    }
-
-    /**
-     * @brief Runs single GEMM problem cooperatively by whole workgroup.
-     *
-     * @param a_ptr input A pointer
-     * @param b_ptr input B pointer
-     * @param aq_ptr input AQ pointer
-     * @param c_ptr output C pointer
-     * @param smem_ptr_0 The start memory pointer of the shared memory block.
-     * @param kargs GEMM kernel arguments
-     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
-     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
-     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
-     *
-     * @tparam DstInMemOp Destination memory operation (default: set).
-     */
-    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
-    CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
-                                       const BDataType* b_ptr,
-                                       const AQDataType* aq_ptr,
-                                       CDataType* c_ptr,
-                                       void* smem_ptr_0,
-                                       const AQuantGemmKernelArgs& kargs,
-                                       const SplitKBatchOffset& splitk_batch_offset,
-                                       const index_t block_idx_m,
-                                       const index_t block_idx_n)
-    {
-        // Create Gemm tensor views, pad views and tile windows
-        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
-            a_ptr, b_ptr, aq_ptr, c_ptr, kargs, splitk_batch_offset);
-
-        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
-
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
-
-        // Run GEMM cooperatively by whole workgroup.
-        const auto& a_block_window  = gemm_tile_windows.at(I0);
-        const auto& aq_block_window = gemm_tile_windows.at(I1);
-        const auto& b_block_window  = gemm_tile_windows.at(I2);
-
-        const auto& c_block_tile = GemmPipeline{}.template operator()(
-            a_block_window, b_block_window, aq_block_window, kargs.M, num_loop, smem_ptr_0);
-
-        // Run Epilogue Pipeline
-        auto& c_block_window = gemm_tile_windows.at(I3);
-
-        EpiloguePipeline{}.template
-        operator()<decltype(c_block_window), decltype(c_block_tile), decltype(c_block_window)>(
-            c_block_window, c_block_tile, c_block_window, smem_ptr_0);
-    }
-
-    CK_TILE_DEVICE void operator()(AQuantGemmKernelArgs kargs) const
-    {
-        const auto blockId  = __builtin_amdgcn_readfirstlane(blockIdx.x);
-        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
-        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
-
-        const SplitKBatchOffset splitk_batch_offset(kargs);
-        // options
-        const ADataType* a_ptr   = static_cast<const ADataType*>(kargs.a_ptr);
-        const BDataType* b_ptr   = static_cast<const BDataType*>(kargs.b_ptr);
-        const AQDataType* aq_ptr = static_cast<const AQDataType*>(kargs.aq_ptr);
-        CDataType* c_ptr         = static_cast<CDataType*>(kargs.c_ptr);
-
-        // allocate LDS
-        __shared__ char smem_ptr_0[GetSmemSize()];
-
-        assert(kargs.k_batch == 1);
-        RunGemm(a_ptr, b_ptr, aq_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
-    }
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_bquant_kernel.hpp b/include/ck_tile/ops/gemm_group_quant/kernel/gemm_bquant_kernel.hpp
deleted file mode 100644
index 24e69d2628..0000000000
--- a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_bquant_kernel.hpp
+++ /dev/null
@@ -1,679 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
-#include "ck_tile/host/concat.hpp"
-
-namespace ck_tile {
-
-struct BQuantGemmProblem
-{
-    CK_TILE_HOST BQuantGemmProblem() = default;
-    CK_TILE_HOST BQuantGemmProblem(index_t M_,
-                                   index_t N_,
-                                   index_t K_,
-                                   index_t QK_,
-                                   index_t stride_A_,
-                                   index_t stride_B_,
-                                   index_t stride_C_,
-                                   index_t stride_BQ_)
-        : M(M_),
-          N(N_),
-          K(K_),
-          QK(QK_),
-          stride_A(stride_A_),
-          stride_B(stride_B_),
-          stride_C(stride_C_),
-          stride_BQ(stride_BQ_)
-    {
-    }
-
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t QK;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-    index_t stride_BQ;
-};
-
-struct BQuantGemmHostArgs : public BQuantGemmProblem
-{
-    CK_TILE_HOST BQuantGemmHostArgs() = default;
-    CK_TILE_HOST BQuantGemmHostArgs(const void* a_ptr_,
-                                    const void* b_ptr_,
-                                    void* c_ptr_,
-                                    const void* bq_ptr_,
-                                    index_t k_batch_,
-                                    index_t M_,
-                                    index_t N_,
-                                    index_t K_,
-                                    index_t QK_,
-                                    index_t stride_A_,
-                                    index_t stride_B_,
-                                    index_t stride_C_,
-                                    index_t stride_BQ_)
-        : BQuantGemmProblem(M_, N_, K_, QK_, stride_A_, stride_B_, stride_C_, stride_BQ_),
-          a_ptr(a_ptr_),
-          b_ptr(b_ptr_),
-          bq_ptr(bq_ptr_),
-          c_ptr(c_ptr_),
-          k_batch(k_batch_)
-    {
-    }
-
-    const void* a_ptr;
-    const void* b_ptr;
-    const void* bq_ptr;
-    void* c_ptr;
-    index_t k_batch;
-};
-
-struct BQuantGemmKernelArgs
-{
-    const void* a_ptr;
-    const void* b_ptr;
-    const void* bq_ptr;
-    void* c_ptr;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t QK;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-    index_t stride_BQ;
-    index_t k_batch;
-};
-
-template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
-struct BQuantGemmKernel
-{
-    using TilePartitioner               = remove_cvref_t<TilePartitioner_>;
-    using GemmPipeline                  = remove_cvref_t<GemmPipeline_>;
-    using EpiloguePipeline              = remove_cvref_t<EpiloguePipeline_>;
-    using ALayout                       = remove_cvref_t<typename GemmPipeline::ALayout>;
-    using BLayout                       = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using BQLayout                      = remove_cvref_t<typename GemmPipeline::BQLayout>;
-    using CLayout                       = remove_cvref_t<typename GemmPipeline::CLayout>;
-    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
-
-    using ADataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using BDataType  = remove_cvref_t<typename GemmPipeline::BDataType>;
-    using BQDataType = remove_cvref_t<typename GemmPipeline::BQDataType>;
-    using CDataType  = remove_cvref_t<typename EpiloguePipeline::ODataType>;
-
-    static constexpr auto I0 = number<0>();
-    static constexpr auto I1 = number<1>();
-    static constexpr auto I2 = number<2>();
-    static constexpr auto I3 = number<3>();
-
-    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
-    {
-        // clang-format off
-        return concat('_', "gemm", gemm_prec_str<ADataType, BDataType>, GemmPipeline::GetName());
-        // clang-format on
-    }
-
-    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
-    {
-        return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
-    }
-
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
-
-    CK_TILE_HOST static constexpr BQuantGemmKernelArgs
-    MakeKernelArgs(const BQuantGemmHostArgs& hostArgs)
-    {
-        return BQuantGemmKernelArgs{hostArgs.a_ptr,
-                                    hostArgs.b_ptr,
-                                    hostArgs.bq_ptr,
-                                    hostArgs.c_ptr,
-                                    hostArgs.M,
-                                    hostArgs.N,
-                                    hostArgs.K,
-                                    hostArgs.QK,
-                                    hostArgs.stride_A,
-                                    hostArgs.stride_B,
-                                    hostArgs.stride_C,
-                                    hostArgs.stride_BQ,
-                                    hostArgs.k_batch};
-    }
-
-    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
-    {
-        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
-    }
-
-    struct SplitKBatchOffset
-    {
-        __device__ SplitKBatchOffset(const BQuantGemmKernelArgs& kargs,
-                                     const std::size_t k_id = blockIdx.z)
-        {
-            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
-
-            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            }
-            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A);
-            }
-
-            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
-            {
-                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B);
-            }
-            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
-            {
-                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            }
-
-            if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
-            {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
-            }
-            else
-            {
-                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
-            }
-        }
-
-        index_t a_k_split_offset;
-        index_t b_k_split_offset;
-        index_t splitted_k;
-    };
-
-    CK_TILE_HOST static bool IsSupportedArgument(const BQuantGemmKernelArgs& kargs)
-    {
-        if(kargs.k_batch != 1)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
-            }
-            return false;
-        }
-
-        static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-        if(kargs.QK % GemmPipeline::GetVectorSizeBQ() != 0)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
-            }
-            return false;
-        }
-
-        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
-               GemmPipeline::kPadK == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
-                                  "without padding!");
-                }
-                return false;
-            }
-            if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support M that is not a multiple of MPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("M is not a multiple of vector load size for A tensor!");
-                }
-                return false;
-            }
-        }
-
-        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support N that is not a multiple of NPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("N is not a multiple of vector load size for B tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
-               GemmPipeline::kPadK == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
-                                  "without padding!");
-                }
-                return false;
-            }
-            if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("K is not a multiple of vector load size for B tensor!");
-                }
-                return false;
-            }
-        }
-
-        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-        {
-            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support N that is not a multiple of NPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("N is not a multiple of vector load size for C tensor!");
-                }
-                return false;
-            }
-        }
-        else
-        {
-            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR(
-                        "Can't support M that is not a multiple of MPerBlock without padding!");
-                }
-                return false;
-            }
-            if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
-            {
-                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-                {
-                    CK_TILE_ERROR("M is not a multiple of vector load size for C tensor!");
-                }
-                return false;
-            }
-        }
-        return true;
-    }
-
-    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
-    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
-                                                   const BDataType* b_ptr,
-                                                   const BQDataType* bq_ptr,
-                                                   CDataType* c_ptr,
-                                                   const BQuantGemmKernelArgs& kargs,
-                                                   const SplitKBatchOffset& splitk_batch_offset)
-    {
-        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
-        const auto& a_tensor_view = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_ptr,
-                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::GetVectorSizeA()>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_ptr,
-                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::GetVectorSizeA()>{},
-                    number<1>{});
-            }
-        }();
-
-        const auto& bq_tensor_view = [&]() {
-            static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-            return make_naive_tensor_view<address_space_enum::global>(
-                bq_ptr,
-                make_tuple(kargs.N, kargs.QK),
-                make_tuple(kargs.stride_BQ, 1),
-                number<GemmPipeline::GetVectorSizeBQ()>{},
-                number<1>{});
-        }();
-
-        const auto& b_tensor_view = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
-            {
-                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
-                {
-                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
-                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
-                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
-                    const auto b_k0_n_k1_desc =
-                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
-                                                     make_tuple(kargs.N * K1, K1, I1),
-                                                     number<VectorSizeB>{},
-                                                     number<1>{});
-                    const auto b_n_k_desc = transform_tensor_descriptor(
-                        b_k0_n_k1_desc,
-                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                                   make_pass_through_transform(kargs.N)),
-                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
-                        make_tuple(sequence<0>{}, sequence<1>{}));
-                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
-                }
-                else
-                {
-                    return make_naive_tensor_view<address_space_enum::global>(
-                        b_ptr,
-                        make_tuple(splitk_batch_offset.splitted_k, kargs.N),
-                        make_tuple(kargs.stride_B, 1),
-                        number<GemmPipeline::GetVectorSizeB()>{},
-                        number<1>{});
-                }
-            }
-            else
-            {
-                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
-                {
-                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
-                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
-                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
-                    const auto b_k0_n_k1_desc =
-                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
-                                                     make_tuple(kargs.N * K1, K1, I1),
-                                                     number<VectorSizeB>{},
-                                                     number<1>{});
-                    const auto b_n_k_desc = transform_tensor_descriptor(
-                        b_k0_n_k1_desc,
-                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                                   make_pass_through_transform(kargs.N)),
-                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
-                        make_tuple(sequence<1>{}, sequence<0>{}));
-                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
-                }
-                else
-                {
-                    return make_naive_tensor_view<address_space_enum::global>(
-                        b_ptr,
-                        make_tuple(kargs.N, splitk_batch_offset.splitted_k),
-                        make_tuple(kargs.stride_B, 1),
-                        number<GemmPipeline::GetVectorSizeB()>{},
-                        number<1>{});
-                }
-            }
-        }();
-
-        // TODO: enable vector write for C in ColMajor
-        const auto& c_tensor_view = [&]() {
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
-                    c_ptr,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(kargs.stride_C, 1),
-                    number<EpiloguePipeline::GetVectorSizeC()>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
-                    c_ptr,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_C),
-                    number<1>{},
-                    number<1>{});
-            }
-        }();
-
-        return make_tuple(a_tensor_view, bq_tensor_view, b_tensor_view, c_tensor_view);
-    }
-
-    template <typename TensorView>
-    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
-    {
-        const auto& a_pad_view = [&]() {
-            const auto& a_tensor_view = views.at(I0);
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(a_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::KPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(a_tensor_view,
-                                       make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                  number<TilePartitioner::MPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadM>{});
-            }
-        }();
-
-        const auto& bq_pad_view = [&]() {
-            const auto& bq_tensor_view = views.at(I1);
-            static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-            return pad_tensor_view(
-                bq_tensor_view,
-                make_tuple(number<TilePartitioner::NPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                // TODO: Add support for padding.
-                sequence<false, false>{});
-        }();
-
-        const auto& b_pad_view = [&]() {
-            const auto& b_tensor_view = views.at(I2);
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
-            {
-                return pad_tensor_view(b_tensor_view,
-                                       make_tuple(number<TilePartitioner::NPerBlock>{},
-                                                  number<TilePartitioner::KPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(b_tensor_view,
-                                       make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadN>{});
-            }
-        }();
-
-        // TODO vector write in for C in ColMajor
-        const auto& c_pad_view = [&]() {
-            const auto& c_tensor_view = views.at(I3);
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(c_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<false, GemmPipeline::kPadN>{});
-            }
-            else
-            {
-                return pad_tensor_view(c_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<GemmPipeline::kPadM, false>{});
-            }
-        }();
-
-        return make_tuple(a_pad_view, bq_pad_view, b_pad_view, c_pad_view);
-    }
-
-    template <typename PadView>
-    CK_TILE_DEVICE static auto
-    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
-    {
-        const auto& a_pad_view  = views.at(I0);
-        const auto& bq_pad_view = views.at(I1);
-        const auto& b_pad_view  = views.at(I2);
-        const auto& c_pad_view  = views.at(I3);
-
-        const auto& a_block_window = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_tile_window(a_pad_view,
-                                        make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                   number<TilePartitioner::KPerBlock>{}),
-                                        {i_m, 0});
-            }
-            else
-            {
-                return make_tile_window(a_pad_view,
-                                        make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                   number<TilePartitioner::MPerBlock>{}),
-                                        {0, i_m});
-            }
-        }();
-
-        const auto& bq_block_window = [&]() {
-            static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-            return make_tile_window(
-                bq_pad_view,
-                make_tuple(number<TilePartitioner::NPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                {i_n, 0});
-        }();
-
-        const auto& b_block_window = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
-            {
-                return make_tile_window(b_pad_view,
-                                        make_tuple(number<TilePartitioner::NPerBlock>{},
-                                                   number<TilePartitioner::KPerBlock>{}),
-                                        {i_n, 0});
-            }
-            else
-            {
-                return make_tile_window(b_pad_view,
-                                        make_tuple(number<TilePartitioner::KPerBlock>{},
-                                                   number<TilePartitioner::NPerBlock>{}),
-                                        {0, i_n});
-            }
-        }();
-
-        auto c_block_window = make_tile_window(
-            c_pad_view,
-            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
-            {i_m, i_n});
-
-        return make_tuple(a_block_window, bq_block_window, b_block_window, c_block_window);
-    }
-
-    /**
-     * @brief Runs single GEMM problem cooperatively by whole workgroup.
-     *
-     * @param a_ptr input A pointer
-     * @param b_ptr input B pointer
-     * @param bq_ptr input BQ pointer
-     * @param c_ptr output C pointer
-     * @param smem_ptr_0 The start memory pointer of the shared memory block.
-     * @param kargs GEMM kernel arguments
-     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
-     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
-     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
-     *
-     * @tparam DstInMemOp Destination memory operation (default: set).
-     */
-    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
-    CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
-                                       const BDataType* b_ptr,
-                                       const BQDataType* bq_ptr,
-                                       CDataType* c_ptr,
-                                       void* smem_ptr_0,
-                                       const BQuantGemmKernelArgs& kargs,
-                                       const SplitKBatchOffset& splitk_batch_offset,
-                                       const index_t block_idx_m,
-                                       const index_t block_idx_n)
-    {
-        // Create Gemm tensor views, pad views and tile windows
-        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
-            a_ptr, b_ptr, bq_ptr, c_ptr, kargs, splitk_batch_offset);
-
-        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
-
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
-            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
-
-        // Run GEMM cooperatively by whole workgroup.
-        const auto& a_block_window  = gemm_tile_windows.at(I0);
-        const auto& bq_block_window = gemm_tile_windows.at(I1);
-        const auto& b_block_window  = gemm_tile_windows.at(I2);
-
-        const auto& c_block_tile = GemmPipeline{}.template operator()(
-            a_block_window, b_block_window, bq_block_window, num_loop, smem_ptr_0);
-
-        // Run Epilogue Pipeline
-        auto& c_block_window = gemm_tile_windows.at(I3);
-
-        EpiloguePipeline{}.template
-        operator()<decltype(c_block_window), decltype(c_block_tile), decltype(c_block_window)>(
-            c_block_window, c_block_tile, c_block_window, smem_ptr_0);
-    }
-
-    CK_TILE_DEVICE void operator()(BQuantGemmKernelArgs kargs) const
-    {
-        const auto blockId  = __builtin_amdgcn_readfirstlane(blockIdx.x);
-        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
-        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
-
-        const SplitKBatchOffset splitk_batch_offset(kargs);
-        // options
-        const ADataType* a_ptr   = static_cast<const ADataType*>(kargs.a_ptr);
-        const BDataType* b_ptr   = static_cast<const BDataType*>(kargs.b_ptr);
-        const BQDataType* bq_ptr = static_cast<const BQDataType*>(kargs.bq_ptr);
-        CDataType* c_ptr         = static_cast<CDataType*>(kargs.c_ptr);
-
-        // allocate LDS
-        __shared__ char smem_ptr_0[GetSmemSize()];
-
-        assert(kargs.k_batch == 1);
-        RunGemm(a_ptr, b_ptr, bq_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
-    }
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_quant_traits.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_quant_traits.hpp
deleted file mode 100644
index 05ce35ae59..0000000000
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_quant_traits.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-
-template <bool kPadM_,
-          bool kPadN_,
-          bool kPadK_,
-          bool PreshuffleQuant_,
-          typename ALayout_,
-          typename BLayout_,
-          typename CLayout_,
-          typename AQLayout_ = ALayout_>
-struct TileGemmAQuantTraits
-{
-    static constexpr bool kPadM = kPadM_;
-    static constexpr bool kPadN = kPadN_;
-    static constexpr bool kPadK = kPadK_;
-
-    static constexpr int _VectorSize = 16;
-
-    using ALayout  = ALayout_;
-    using BLayout  = BLayout_;
-    using CLayout  = CLayout_;
-    using AQLayout = AQLayout_;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-    static constexpr index_t NumWaveGroups      = 1;
-
-    static constexpr bool PreshuffleQuant = PreshuffleQuant_;
-};
-
-template <bool kPadM_,
-          bool kPadN_,
-          bool kPadK_,
-          bool PreshuffleQuant_,
-          typename ALayout_,
-          typename BLayout_,
-          typename CLayout_,
-          typename BQLayout_ = BLayout_>
-struct TileGemmBQuantTraits
-{
-    static constexpr bool kPadM = kPadM_;
-    static constexpr bool kPadN = kPadN_;
-    static constexpr bool kPadK = kPadK_;
-
-    static constexpr int _VectorSize = 16;
-
-    using ALayout  = ALayout_;
-    using BLayout  = BLayout_;
-    using CLayout  = CLayout_;
-    using BQLayout = BQLayout_;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-    static constexpr index_t NumWaveGroups      = 1;
-    static constexpr bool PreshuffleQuant       = PreshuffleQuant_;
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant.hpp b/include/ck_tile/ops/gemm_quant.hpp
new file mode 100644
index 0000000000..49a27850c5
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant.hpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp"
+#include "ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp"
+#include "ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp"
+#include "ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp"
+#include "ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_base_policy.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
new file mode 100644
index 0000000000..c4c1f1bbf7
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp"
+
+namespace ck_tile {
+
+// A is block window on shared memory
+// BQ (scale tensor) is block distributed tensor.
+// Consecutive kQuantGroupSize elements of B are quantized with a separate scale.
+// B is block window on block distributed tensor.
+// C is block distributed tensor
+template <typename Problem_, typename BlockPolicy_>
+struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
+{
+    using Problem         = remove_cvref_t<Problem_>;
+    using BlockPolicy     = remove_cvref_t<BlockPolicy_>;
+    using ADataType       = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+    using BQDataType      = remove_cvref_t<typename Problem::BQDataType>;
+    using CDataType       = remove_cvref_t<typename Problem::CDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using BlockGemmShape  = remove_cvref_t<typename Problem::BlockGemmShape>; // TileFlatmmShape
+
+    static constexpr auto I0   = number<0>();
+    static constexpr auto I1   = number<1>();
+    static constexpr auto I2   = number<2>();
+    static constexpr auto idxM = I0;
+    static constexpr auto idxN = I1;
+    static constexpr auto idxK = I2;
+    using BlockTile            = remove_cvref_t<typename BlockGemmShape::BlockTile>;
+    using BlockWarps           = remove_cvref_t<typename BlockGemmShape::BlockWarps>;
+    using WarpTile             = remove_cvref_t<typename BlockGemmShape::WarpTile>;
+
+    static constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+    static constexpr auto warp_size = get_warp_size();
+
+    using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+    static constexpr index_t MWarp = config.template at<1>();
+    static constexpr index_t NWarp = config.template at<2>();
+
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    static constexpr index_t kQuantGroupSize = Problem::kQuantGroupSize;
+    static constexpr index_t kBlockSize      = Problem::kBlockSize;
+
+    static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+    static constexpr index_t NIterPerWarp =
+        BlockTile::at(idxN) / (WarpTile::at(idxN) * BlockWarps::at(idxN));
+    static constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+    static constexpr auto MIter_2nd_last =
+        (MIterPerWarp >= 2) ? MIterPerWarp - 2 : MIterPerWarp - 1;
+
+    static constexpr index_t KPerBlockBQ = KPerBlock / kQuantGroupSize;
+
+    static constexpr index_t QScalesPerBlockRow =
+        (KPerBlock + kQuantGroupSize - 1) / kQuantGroupSize;
+
+    static constexpr index_t QScalesPerWarpGemmRow =
+        (WG::kK + kQuantGroupSize - 1) / kQuantGroupSize;
+
+    static constexpr index_t KIterPerQScale = KIterPerWarp / QScalesPerBlockRow;
+    static constexpr index_t DsReadPreload  = 2; // default 2, preload 2 ds read
+
+    static constexpr index_t m_preload = (MIterPerWarp * KIterPerWarp >= DsReadPreload)
+                                             ? DsReadPreload
+                                             : MIterPerWarp * KIterPerWarp;
+
+    template <typename T>
+    CK_TILE_DEVICE static float cvt_scale_to_fp32(T& scale)
+    {
+        float scale_reg_f = 0.f;
+        if constexpr(std::is_same_v<BQDataType, ck_tile::fp8_t>)
+        {
+            scale_reg_f = element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+        }
+        else if constexpr(std::is_same_v<BQDataType, ck_tile::bf8_t>)
+        {
+            scale_reg_f = element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+        }
+        else if constexpr(std::is_same_v<BQDataType, float>)
+        {
+            scale_reg_f = ck_tile::bit_cast<float>(scale);
+        }
+        else
+        {
+            static_assert(false, "BQDataType must be float, fp8_t or bf8_t.");
+        }
+        return scale_reg_f;
+    }
+
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+
+        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    // C += A * B
+    template <typename CBlockTensor,
+              typename ABlockTensor,
+              typename BFlatBlockTensor,
+              typename BQBlockTensor,
+              typename ABlockWindow>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   ABlockTensor& a_warp_tensor,
+                                   BFlatBlockTensor& b_warp_tensor,
+                                   BQBlockTensor& bq_block_tensor,
+                                   ABlockWindow& a_warp_windows) const
+    {
+        using CWarpDstr   = typename WG::CWarpDstr;
+        using CWarpTensor = typename WG::CWarpTensor;
+
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        static_for<0, QScalesPerBlockRow, 1>{}([&](auto kQScale) {
+            CWarpTensor c_warp_tensor;
+            static_for<0, KIterPerQScale, 1>{}([&](auto kIterInQScale) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                        constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
+
+                        constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
+
+                        // warp GEMM
+                        if constexpr(kIterInQScale == 0)
+                            c_warp_tensor = WG{}(a_warp_tensor(number<AwarpIter>{}),
+                                                 b_warp_tensor(nIter)(number<kIter>{}));
+                        else
+                            WG{}(c_warp_tensor,
+                                 a_warp_tensor(number<AwarpIter>{}),
+                                 b_warp_tensor(nIter)(number<kIter>{}));
+
+                        __builtin_amdgcn_sched_barrier(0x7F6);
+                        // preload next A from lds
+                        if constexpr((kIter * MIterPerWarp + mIter) <
+                                     (KIterPerWarp * MIterPerWarp - m_preload))
+                        {
+                            constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
+                            constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
+                            a_warp_tensor(number<AwarpIter>{}) =
+                                load_tile(a_warp_windows(number<AmIter>{})(number<AkIter>{}));
+                        }
+                        // barrier
+                        if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last))
+                        {
+                            block_sync_lds();
+                        }
+                    });
+                });
+            });
+
+            constexpr auto tbuf_offset =
+                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(merge_sequences(
+                           sequence<number<0>{}, number<0>{}>{}, c_warp_y_index_zeros)) /
+                       CBlockTensor::PackedSize>{};
+
+            constexpr index_t reg_offset = kQScale;
+            // nIter * KPerBlockBQ + kQScale; //((kIter * WG::kK) / kQuantGroupSize);
+
+            auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
+            float scale_reg_f = cvt_scale_to_fp32(scale_reg);
+
+            static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
+                c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
+                    (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
+            });
+        });
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
similarity index 94%
rename from include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
rename to include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
index 614245f05b..d4bece1a83 100644
--- a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -5,19 +5,19 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 
 namespace ck_tile {
 
-template <typename Problem, index_t UnaryOpSize_ = 8>
+template <typename Problem>
 struct BlockGemmAQuantBase
 {
     using AQDataType      = remove_cvref_t<typename Problem::AQDataType>;
     using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
 
-    static constexpr index_t UnaryOpSize = UnaryOpSize_;
     template <typename T>
     CK_TILE_DEVICE static float cvt_scale_to_fp32(T scale)
     {
@@ -42,23 +42,6 @@ struct BlockGemmAQuantBase
         }
         return scale_reg_f;
     }
-
-    template <typename WarpWindow, typename WarpTile>
-    CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& warp_tile,
-                                                        const WarpWindow& warp_window)
-    {
-        const element_wise::PassThroughPack8 elementwise_op{};
-
-        static_assert(WarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
-        constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize;
-        const auto in_dstr_tensors           = load_tile(warp_window);
-
-        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
-        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
-            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
-                           in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
-        });
-    }
 };
 
 // A is block window on shared memory
@@ -66,7 +49,9 @@ struct BlockGemmAQuantBase
 // Consecutive kQuantGroupSize elements of A are quantized with a separate scale.
 // B is block window on shared memory
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmASmemBSmemCRegV1DefaultPolicy>
+template <typename Problem_,
+          typename Policy_     = BlockGemmASmemBSmemCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
 struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
 {
     private:
@@ -172,6 +157,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
 
     using Base = BlockGemmAQuantBase<Problem_>;
 
+    using Loader   = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
     static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
@@ -181,9 +167,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
     static constexpr index_t MWarp = Traits::MWarp;
     static constexpr index_t NWarp = Traits::NWarp;
 
-    static constexpr auto Scheduler       = Traits::Scheduler;
-    static constexpr uint8_t kA_cvt_scale = std::is_same_v<ADataType, pk_int4_t> ? 16 : 1;
-    static constexpr uint8_t kB_cvt_scale = std::is_same_v<BDataType, pk_int4_t> ? 16 : 1;
+    static constexpr auto Scheduler = Traits::Scheduler;
 
     using AWarpDstr = typename WarpGemm::AWarpDstr;
     using BWarpDstr = typename WarpGemm::BWarpDstr;
@@ -294,7 +278,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
             {
                 static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
                               std::is_same_v<ComputeDataType, bf8_t>);
-                Base::load_interleaved_pk_type(a_warp_tile_, a_block_window);
+                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
             }
             else
             {
@@ -304,7 +288,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
             {
                 static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
                               std::is_same_v<ComputeDataType, bf8_t>);
-                Base::load_interleaved_pk_type(b_warp_tile_, b_block_window);
+                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
             }
             else
             {
@@ -363,7 +347,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                             if constexpr(Traits::TransposeC) // transposed C
                             {
                                 static_assert(false,
-                                              "It is not supported yet to enable both Preshuffle.");
+                                              "It is not supported yet to enable both Preshuffle "
+                                              "and TransposeC.");
                                 // TODO:
                                 // A new tile distribution is needed for the Preshuffle and
                                 // Transpose combination. For instance, with mnk at 16x16x32, lanes
@@ -451,13 +436,13 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
 
                                         c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                                             (c_warp_tensor.get_thread_buffer()[c_row] *
-                                             scale_reg_f * kA_cvt_scale * kB_cvt_scale);
+                                             scale_reg_f);
                                     });
                             }
                         }
                         else
                         {
-                            if(Traits::TransposeC) // transposed C
+                            if constexpr(Traits::TransposeC) // transposed C
                             {
                                 constexpr index_t reg_offset = mIter * Traits::AQPerBlock + kQScale;
                                 constexpr auto tbuf_offset   = number<
@@ -471,7 +456,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                                     [&](auto c_row) {
                                         c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                                             (c_warp_tensor.get_thread_buffer()[c_row] *
-                                             scale_reg_f * kA_cvt_scale * kB_cvt_scale);
+                                             scale_reg_f);
                                     });
                             }
                             else
@@ -556,7 +541,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                                                                        reg_offset_for_row_data] +=
                                         (c_warp_tensor
                                              .get_thread_buffer()[reg_offset_for_row_data] *
-                                         scale_reg_f * kA_cvt_scale * kB_cvt_scale);
+                                         scale_reg_f);
                                 });
                             }
                         }
diff --git a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
similarity index 92%
rename from include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
rename to include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
index 844c8f6eb0..077d0d8fe2 100644
--- a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -5,19 +5,19 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 
 namespace ck_tile {
 
-template <typename Problem, index_t UnaryOpSize_ = 8>
+template <typename Problem>
 struct BlockGemmBQuantBase
 {
     using BQDataType      = remove_cvref_t<typename Problem::BQDataType>;
     using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
 
-    static constexpr index_t UnaryOpSize = UnaryOpSize_;
     template <typename T>
     CK_TILE_DEVICE static float cvt_scale_to_fp32(T scale)
     {
@@ -42,24 +42,6 @@ struct BlockGemmBQuantBase
         }
         return scale_reg_f;
     }
-
-    // can be inherited from A
-    template <typename WarpWindow, typename WarpTile>
-    CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& warp_tile,
-                                                        const WarpWindow& warp_window)
-    {
-        const element_wise::PassThroughPack8 elementwise_op{};
-
-        static_assert(WarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
-        constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize;
-        const auto in_dstr_tensors           = load_tile(warp_window);
-
-        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
-        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
-            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
-                           in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
-        });
-    }
 };
 
 // A is block window on shared memory
@@ -67,7 +49,9 @@ struct BlockGemmBQuantBase
 // Consecutive kQuantGroupSize elements of B are quantized with a separate scale.
 // B is block window on shared memory
 // C is block distributed tensor
-template <typename Problem_, typename Policy_ = BlockGemmASmemBSmemCRegV1DefaultPolicy>
+template <typename Problem_,
+          typename Policy_     = BlockGemmASmemBSmemCRegV1DefaultPolicy,
+          index_t UnaryOpSize_ = 8>
 struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
 {
     private:
@@ -170,6 +154,7 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
 
     using Base = BlockGemmBQuantBase<Problem_>;
 
+    using Loader   = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
     static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
@@ -179,9 +164,7 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
     static constexpr index_t MWarp = Traits::MWarp;
     static constexpr index_t NWarp = Traits::NWarp;
 
-    static constexpr auto Scheduler       = Traits::Scheduler;
-    static constexpr uint8_t kA_cvt_scale = std::is_same_v<ADataType, pk_int4_t> ? 16 : 1;
-    static constexpr uint8_t kB_cvt_scale = std::is_same_v<BDataType, pk_int4_t> ? 16 : 1;
+    static constexpr auto Scheduler = Traits::Scheduler;
 
     using AWarpDstr = typename WarpGemm::AWarpDstr;
     using BWarpDstr = typename WarpGemm::BWarpDstr;
@@ -293,7 +276,7 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
             {
                 static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
                               std::is_same_v<ComputeDataType, bf8_t>);
-                Base::load_interleaved_pk_type(a_warp_tile_, a_block_window);
+                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
             }
             else
             {
@@ -303,7 +286,7 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
             {
                 static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
                               std::is_same_v<ComputeDataType, bf8_t>);
-                Base::load_interleaved_pk_type(b_warp_tile_, b_block_window);
+                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
             }
             else
             {
@@ -384,8 +367,7 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
                         float scale_reg_f = Base::cvt_scale_to_fp32(scale_reg);
                         static_for<0, WarpGemm::kM / 2, 1>{}([&](auto c_row) {
                             c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
-                                (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f *
-                                 kA_cvt_scale * kB_cvt_scale);
+                                (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                         });
                     });
                 });
diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
new file mode 100644
index 0000000000..a0b6fc5821
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -0,0 +1,1148 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/host/concat.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp"
+
+namespace ck_tile {
+
+namespace detail {
+// Helper templates for safe type extraction
+template <typename, typename Default, typename = void>
+struct get_aq_layout_or
+{
+    using type = Default;
+};
+
+template <typename T, typename Default>
+struct get_aq_layout_or<T, Default, std::void_t<typename T::AQLayout>>
+{
+    using type = typename T::AQLayout;
+};
+
+template <typename, typename Default, typename = void>
+struct get_bq_layout_or
+{
+    using type = Default;
+};
+
+template <typename T, typename Default>
+struct get_bq_layout_or<T, Default, std::void_t<typename T::BQLayout>>
+{
+    using type = typename T::BQLayout;
+};
+
+template <typename, typename Default, typename = void>
+struct get_aq_data_type_or
+{
+    using type = Default;
+};
+
+template <typename T, typename Default>
+struct get_aq_data_type_or<T, Default, std::void_t<typename T::AQDataType>>
+{
+    using type = typename T::AQDataType;
+};
+
+template <typename, typename Default, typename = void>
+struct get_bq_data_type_or
+{
+    using type = Default;
+};
+
+template <typename T, typename Default>
+struct get_bq_data_type_or<T, Default, std::void_t<typename T::BQDataType>>
+{
+    using type = typename T::BQDataType;
+};
+
+template <typename, typename = void>
+struct is_quantpreshuffle_enabled
+{
+    static constexpr bool value = false;
+};
+
+template <typename T>
+struct is_quantpreshuffle_enabled<T, decltype(T::PreshuffleQuant)>
+{
+    static constexpr bool value = T::PreshuffleQuant;
+};
+
+template <typename, typename = void>
+struct is_preshuffleB_enabled
+{
+    static constexpr bool value = false;
+};
+
+template <typename T>
+struct is_preshuffleB_enabled<T, std::void_t<decltype(T::PreshuffleB)>>
+{
+    static constexpr bool value = T::PreshuffleB;
+};
+} // namespace detail
+
+struct QuantGemmProblem
+{
+    CK_TILE_HOST QuantGemmProblem() = default;
+    CK_TILE_HOST QuantGemmProblem(index_t M_,
+                                  index_t N_,
+                                  index_t K_,
+                                  index_t QK_A_,
+                                  index_t QK_B_,
+                                  index_t stride_A_,
+                                  index_t stride_B_,
+                                  index_t stride_C_,
+                                  index_t stride_AQ_,
+                                  index_t stride_BQ_)
+        : M(M_),
+          N(N_),
+          K(K_),
+          QK_A(QK_A_),
+          QK_B(QK_B_),
+          stride_A(stride_A_),
+          stride_B(stride_B_),
+          stride_C(stride_C_),
+          stride_AQ(stride_AQ_),
+          stride_BQ(stride_BQ_)
+    {
+    }
+
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t QK_A;
+    index_t QK_B;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+    index_t stride_AQ;
+    index_t stride_BQ;
+};
+
+struct QuantGemmHostArgs : public QuantGemmProblem
+{
+    CK_TILE_HOST QuantGemmHostArgs() = default;
+    CK_TILE_HOST QuantGemmHostArgs(const void* a_ptr_,
+                                   const void* b_ptr_,
+                                   void* c_ptr_,
+                                   const void* aq_ptr_,
+                                   const void* bq_ptr_,
+                                   index_t k_batch_,
+                                   index_t M_,
+                                   index_t N_,
+                                   index_t K_,
+                                   index_t QK_A_,
+                                   index_t QK_B_,
+                                   index_t stride_A_,
+                                   index_t stride_B_,
+                                   index_t stride_C_,
+                                   index_t stride_AQ_,
+                                   index_t stride_BQ_)
+        : QuantGemmProblem(
+              M_, N_, K_, QK_A_, QK_B_, stride_A_, stride_B_, stride_C_, stride_AQ_, stride_BQ_),
+          a_ptr(a_ptr_),
+          b_ptr(b_ptr_),
+          aq_ptr(aq_ptr_),
+          bq_ptr(bq_ptr_),
+          c_ptr(c_ptr_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* a_ptr  = nullptr;
+    const void* b_ptr  = nullptr;
+    const void* aq_ptr = nullptr;
+    const void* bq_ptr = nullptr;
+    void* c_ptr        = nullptr;
+    index_t k_batch    = 0;
+};
+
+struct QuantGemmKernelArgs
+{
+    const void* a_ptr;
+    const void* b_ptr;
+    const void* aq_ptr;
+    const void* bq_ptr;
+    void* c_ptr;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t QK_A;
+    index_t QK_B;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+    index_t stride_AQ;
+    index_t stride_BQ;
+    index_t k_batch;
+};
+
+template <typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_,
+          QuantType QuantType_>
+struct QuantGemmKernel
+{
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout          = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout          = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout          = remove_cvref_t<typename GemmPipeline::CLayout>;
+
+    using AQLayout = remove_cvref_t<
+        typename detail::get_aq_layout_or<GemmPipeline, typename GemmPipeline::ALayout>::type>;
+    using BQLayout = remove_cvref_t<
+        typename detail::get_bq_layout_or<GemmPipeline, typename GemmPipeline::BLayout>::type>;
+
+    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
+    static constexpr bool PreshuffleQuant =
+        detail::is_quantpreshuffle_enabled<GemmPipeline_>::value;
+    static constexpr bool PreshuffleB = detail::is_preshuffleB_enabled<GemmPipeline_>::value;
+
+    using ADataType   = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType   = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType   = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using AccDataType = remove_cvref_t<typename EpiloguePipeline::AccDataType>;
+
+    using AQDataType =
+        remove_cvref_t<typename detail::get_aq_data_type_or<GemmPipeline, AccDataType>::type>;
+    using BQDataType =
+        remove_cvref_t<typename detail::get_bq_data_type_or<GemmPipeline, AccDataType>::type>;
+
+    static constexpr auto I0 = number<0>(); // A Tensor
+    static constexpr auto I1 = number<1>(); // AQ Tensor
+    static constexpr auto I2 = number<2>(); // B Tensor
+    static constexpr auto I3 = number<3>(); // BQ Tensor
+    static constexpr auto I4 = number<4>(); // C Tensor
+
+    static constexpr auto kQuantType = QuantType_;
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "gemm_quant", gemm_prec_str<ADataType, BDataType>, GemmPipeline::GetName());
+        // clang-format on
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+
+    CK_TILE_HOST static constexpr QuantGemmKernelArgs
+    MakeKernelArgs(const QuantGemmHostArgs& hostArgs)
+    {
+        return QuantGemmKernelArgs{hostArgs.a_ptr,
+                                   hostArgs.b_ptr,
+                                   hostArgs.aq_ptr,
+                                   hostArgs.bq_ptr,
+                                   hostArgs.c_ptr,
+                                   hostArgs.M,
+                                   hostArgs.N,
+                                   hostArgs.K,
+                                   hostArgs.QK_A,
+                                   hostArgs.QK_B,
+                                   hostArgs.stride_A,
+                                   hostArgs.stride_B,
+                                   hostArgs.stride_C,
+                                   hostArgs.stride_AQ,
+                                   hostArgs.stride_BQ,
+                                   hostArgs.k_batch};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    struct SplitKBatchOffset
+    {
+        __device__ SplitKBatchOffset(const QuantGemmKernelArgs& kargs,
+                                     const std::size_t k_id = blockIdx.z)
+        {
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
+            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
+            const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_A);
+            }
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B);
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            }
+
+            if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
+            {
+                splitted_k = amd_wave_read_first_lane(KRead);
+            }
+            else
+            {
+                splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t splitted_k;
+    };
+
+    CK_TILE_HOST static bool IsSupportedArgument(const QuantGemmKernelArgs& kargs)
+    {
+        if(kargs.k_batch != 1)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+            }
+            return false;
+        }
+
+        if constexpr(kQuantType == QuantType::AQuantGrouped)
+        {
+            static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
+            if(kargs.QK_A % GemmPipeline::GetVectorSizeAQ() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K_A is not a multiple of vector load size for A tensor!");
+                }
+                return false;
+            }
+        }
+
+        if constexpr(kQuantType == QuantType::BQuantGrouped)
+        {
+            static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+            if(kargs.QK_B % GemmPipeline::GetVectorSizeBQ() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K_B is not a multiple of vector load size for B tensor!");
+                }
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
+               GemmPipeline::kPadK == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
+                                  "without padding!");
+                }
+                return false;
+            }
+            if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support M that is not a multiple of MPerBlock without padding!");
+                }
+                return false;
+            }
+            if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("M is not a multiple of vector load size for A tensor!");
+                }
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support N that is not a multiple of NPerBlock without padding!");
+                }
+                return false;
+            }
+            if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("N is not a multiple of vector load size for B tensor!");
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
+               GemmPipeline::kPadK == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
+                                  "without padding!");
+                }
+                return false;
+            }
+            if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K is not a multiple of vector load size for B tensor!");
+                }
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support N that is not a multiple of NPerBlock without padding!");
+                }
+                return false;
+            }
+            if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("N is not a multiple of vector load size for C tensor!");
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support M that is not a multiple of MPerBlock without padding!");
+                }
+                return false;
+            }
+            if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("M is not a multiple of vector load size for C tensor!");
+                }
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
+                                                   const BDataType* b_ptr,
+                                                   const AQDataType* aq_ptr,
+                                                   const BQDataType* bq_ptr,
+                                                   CDataType* c_ptr,
+                                                   const QuantGemmKernelArgs& kargs,
+                                                   const SplitKBatchOffset& splitk_batch_offset)
+    {
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
+        const auto& a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+        }();
+
+        const auto get_padding_size = [](index_t length, index_t alignment) {
+            return ck_tile::integer_least_multiple(length, alignment) - length;
+        };
+
+        const auto& aq_tensor_view = [&]() {
+            if constexpr(kQuantType == QuantType::AQuantGrouped && PreshuffleQuant)
+            {
+                static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
+                const auto aq_x = kargs.M * GemmPipeline::KPerBlockAQ;
+                const auto aq_y = kargs.QK_A / GemmPipeline::KPerBlockAQ;
+
+                const auto aq_desc =
+                    make_naive_tensor_descriptor(make_tuple(aq_y, aq_x),
+                                                 make_tuple(aq_x, 1),
+                                                 number<GemmPipeline::GetVectorSizeAQ()>{},
+                                                 number<1>{});
+
+                const auto block_tile_size = GemmPipeline::MPerBlock * GemmPipeline::KPerBlockAQ;
+                const auto aq_pad0_desc    = transform_tensor_descriptor(
+                    aq_desc,
+                    make_tuple(
+                        make_pass_through_transform(aq_y),
+                        make_right_pad_transform(aq_x, get_padding_size(aq_x, block_tile_size))),
+                    make_tuple(sequence<0>{}, sequence<1>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}));
+
+                const auto pad_aq_x = aq_pad0_desc.get_lengths()[I1];
+                const auto wave_tile_size =
+                    TilePartitioner::BlockGemmShape::WarpTile::at(I0) * GemmPipeline::KPerBlockAQ;
+                const auto wave_tile_count_x =
+                    ck_tile::integer_divide_ceil(pad_aq_x, wave_tile_size);
+                const auto aq_unmerge_pad0_desc = transform_tensor_descriptor(
+                    aq_pad0_desc,
+                    make_tuple(
+                        make_pass_through_transform(aq_y),
+                        make_unmerge_transform(make_tuple(wave_tile_count_x, wave_tile_size))),
+                    make_tuple(sequence<0>{}, sequence<1>{}),
+                    make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                const auto aq_pad1_desc = transform_tensor_descriptor(
+                    aq_unmerge_pad0_desc,
+                    make_tuple(
+                        make_pass_through_transform(aq_y),
+                        make_pass_through_transform(wave_tile_count_x),
+                        make_right_pad_transform(
+                            wave_tile_size, get_padding_size(wave_tile_size, get_warp_size()))),
+                    make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+                const auto pad_wave_size =
+                    ck_tile::integer_least_multiple(wave_tile_size, get_warp_size());
+                const auto aq_merge_pad1_desc = transform_tensor_descriptor(
+                    aq_pad1_desc,
+                    make_tuple(make_merge_transform(make_tuple(aq_y, wave_tile_count_x)),
+                               make_pass_through_transform(pad_wave_size)),
+                    make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}));
+
+                return make_tensor_view<address_space_enum::global>(aq_ptr, aq_merge_pad1_desc);
+            }
+            else if constexpr(kQuantType == QuantType::AQuantGrouped && !PreshuffleQuant)
+            {
+                static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
+                return make_naive_tensor_view<address_space_enum::global>(
+                    aq_ptr,
+                    make_tuple(kargs.M, kargs.QK_A),
+                    make_tuple(kargs.stride_AQ, 1),
+                    number<GemmPipeline::GetVectorSizeAQ()>{},
+                    number<1>{});
+            }
+            else if constexpr(kQuantType == QuantType::RowColQuant)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    aq_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, 0), // broadcasting over n
+                    number<1>{},
+                    number<1>{});
+            }
+            else
+            {
+                return nullptr; // TODO: use some other "empty" type for this
+            }
+        }();
+
+        const auto& b_tensor_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
+                {
+                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
+                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
+                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
+                    const auto b_k0_n_k1_desc =
+                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
+                                                     make_tuple(kargs.N * K1, K1, I1),
+                                                     number<VectorSizeB>{},
+                                                     number<1>{});
+                    const auto b_n_k_desc = transform_tensor_descriptor(
+                        b_k0_n_k1_desc,
+                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
+                                   make_pass_through_transform(kargs.N)),
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
+                }
+                else
+                {
+                    return make_naive_tensor_view<address_space_enum::global>(
+                        b_ptr,
+                        make_tuple(splitk_batch_offset.splitted_k, kargs.N),
+                        make_tuple(kargs.stride_B, 1),
+                        number<GemmPipeline::GetVectorSizeB()>{},
+                        number<1>{});
+                }
+            }
+            else
+            {
+                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
+                {
+                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
+                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
+                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
+                    const auto b_k0_n_k1_desc =
+                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
+                                                     make_tuple(kargs.N * K1, K1, I1),
+                                                     number<VectorSizeB>{},
+                                                     number<1>{});
+                    const auto b_n_k_desc = transform_tensor_descriptor(
+                        b_k0_n_k1_desc,
+                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
+                                   make_pass_through_transform(kargs.N)),
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
+                        make_tuple(sequence<1>{}, sequence<0>{}));
+                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
+                }
+                else
+                {
+                    if constexpr(PreshuffleB)
+                    {
+                        index_t kFlatK =
+                            GemmPipeline::flatKPerWarp *
+                            (splitk_batch_offset.splitted_k /
+                             TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}));
+                        index_t kFlatN = kargs.N * kargs.K / kFlatK;
+
+                        return make_naive_tensor_view<address_space_enum::global>(
+                            b_ptr,
+                            make_tuple(kFlatN, kFlatK),
+                            make_tuple(kFlatK, 1),
+                            number<GemmPipeline::GetVectorSizeB()>{},
+                            number<1>{});
+                    }
+                    else
+                    {
+                        return make_naive_tensor_view<address_space_enum::global>(
+                            b_ptr,
+                            make_tuple(kargs.N, splitk_batch_offset.splitted_k),
+                            make_tuple(kargs.stride_B, 1),
+                            number<GemmPipeline::GetVectorSizeB()>{},
+                            number<1>{});
+                    }
+                }
+            }
+        }();
+
+        const auto& bq_tensor_view = [&]() {
+            if constexpr(kQuantType == QuantType::RowColQuant)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    bq_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(0, 1), // broadcasting over m
+                    number<1>{},
+                    number<1>{});
+            }
+            else if constexpr(kQuantType == QuantType::BQuantGrouped)
+            {
+                static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                return make_naive_tensor_view<address_space_enum::global>(
+                    bq_ptr,
+                    make_tuple(kargs.N, kargs.QK_B),
+                    make_tuple(kargs.stride_BQ, 1),
+                    number<GemmPipeline::GetVectorSizeBQ()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return nullptr; // TODO: use some other "empty" type for this
+            }
+        }();
+
+        // TODO: enable vector write for C in ColMajor
+        const auto& c_tensor_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<EpiloguePipeline::GetVectorSizeC()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        return make_tuple(
+            a_tensor_view, aq_tensor_view, b_tensor_view, bq_tensor_view, c_tensor_view);
+    }
+
+    template <typename TensorView>
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
+    {
+        const auto& a_pad_view = [&]() {
+            const auto& a_tensor_view = views.at(I0);
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                  number<TilePartitioner::MPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadM>{});
+            }
+        }();
+
+        // no padding
+        const auto& aq_pad_view = [&]() { return views.at(I1); }();
+
+        const auto& b_flat_view = views.at(I2); // not applying any padding to flat B view
+
+        const auto& b_pad_view = [&]() {
+            const auto& b_tensor_view = views.at(I2);
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadN>{});
+            }
+        }();
+
+        // no padding
+        const auto& bq_pad_view = [&]() { return views.at(I3); }();
+
+        // TODO vector write in for C in ColMajor
+        const auto& c_pad_view = [&]() {
+            const auto& c_tensor_view = views.at(I4);
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        if constexpr(PreshuffleB)
+        {
+            return make_tuple(a_pad_view, aq_pad_view, b_flat_view, bq_pad_view, c_pad_view);
+        }
+        else
+        {
+            return make_tuple(a_pad_view, aq_pad_view, b_pad_view, bq_pad_view, c_pad_view);
+        }
+    }
+
+    template <typename PadView>
+    CK_TILE_DEVICE static auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    {
+        const auto& a_pad_view     = views.at(I0);
+        const auto& aq_pad_view    = views.at(I1);
+        const auto& b_pad_view     = views.at(I2);
+        const auto& bq_pad_view    = views.at(I3);
+        const auto& c_pad_view     = views.at(I4);
+        const auto& a_block_window = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_tile_window(a_pad_view,
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                   number<TilePartitioner::KPerBlock>{}),
+                                        {i_m, 0});
+            }
+            else
+            {
+                return make_tile_window(a_pad_view,
+                                        make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                   number<TilePartitioner::MPerBlock>{}),
+                                        {0, i_m});
+            }
+        }();
+
+        const auto& aq_block_window = [&]() {
+            if constexpr(kQuantType == QuantType::AQuantGrouped && PreshuffleQuant)
+            {
+                static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
+                constexpr auto block_m = TilePartitioner::MPerBlock;
+                constexpr auto warp_m  = TilePartitioner::BlockGemmShape::WarpTile::at(I0);
+                constexpr auto aqk_per_block =
+                    TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize;
+                constexpr auto tile_window_width =
+                    ck_tile::integer_least_multiple(warp_m * aqk_per_block, get_warp_size());
+                constexpr auto tile_window_height = block_m / warp_m;
+                auto block_m_idx                  = i_m / block_m;
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<tile_window_height>{}, number<tile_window_width>{}),
+                    {block_m_idx * tile_window_height, 0});
+            }
+            else if constexpr(kQuantType == QuantType::AQuantGrouped && !PreshuffleQuant)
+            {
+                static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
+                constexpr auto block_m = TilePartitioner::MPerBlock;
+                constexpr auto block_k = TilePartitioner::KPerBlock;
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<block_m>{}, number<block_k / GemmPipeline::QuantGroupSize>{}),
+                    {i_m, 0});
+            }
+            else if constexpr(kQuantType == QuantType::RowColQuant)
+            {
+                return make_tile_window(aq_pad_view,
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                   number<TilePartitioner::NPerBlock>{}),
+                                        {i_m, i_n});
+            }
+            else
+            {
+                return nullptr; // TODO: use some other "empty" type?
+            }
+        }();
+
+        const auto& b_block_window = [&]() {
+            if constexpr(PreshuffleB)
+            {
+                return make_tile_window(
+                    b_pad_view,
+                    make_tuple(number<GemmPipeline::flatNPerWarp>{},
+                               number<GemmPipeline::flatKPerWarp>{}),
+                    {static_cast<int>(i_n / TilePartitioner::BlockGemmShape::WarpTile::at(I1)), 0});
+            }
+            else
+            {
+                if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+                {
+                    return make_tile_window(b_pad_view,
+                                            make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                       number<TilePartitioner::KPerBlock>{}),
+                                            {i_n, 0});
+                }
+                else
+                {
+                    return make_tile_window(b_pad_view,
+                                            make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                       number<TilePartitioner::NPerBlock>{}),
+                                            {0, i_n});
+                }
+            }
+        }();
+
+        const auto& bq_block_window = [&]() {
+            if constexpr(kQuantType == QuantType::RowColQuant)
+            {
+                return make_tile_window(bq_pad_view,
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                   number<TilePartitioner::NPerBlock>{}),
+                                        {i_m, i_n});
+            }
+            else if constexpr(kQuantType == QuantType::BQuantGrouped)
+            {
+                static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                return make_tile_window(
+                    bq_pad_view,
+                    make_tuple(number<TilePartitioner::NPerBlock>{},
+                               number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
+                    {i_n, 0});
+            }
+            else
+            {
+                return nullptr; // TODO: use some other "empty" type here
+            }
+        }();
+
+        auto c_block_window = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {i_m, i_n});
+
+        return make_tuple(
+            a_block_window, aq_block_window, b_block_window, bq_block_window, c_block_window);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param aq_ptr input AQ pointer
+     * @param bq_ptr input BQ pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
+     */
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
+                                       const BDataType* b_ptr,
+                                       const AQDataType* aq_ptr,
+                                       const BQDataType* bq_ptr,
+                                       CDataType* c_ptr,
+                                       void* smem_ptr_0,
+                                       const QuantGemmKernelArgs& kargs,
+                                       const SplitKBatchOffset& splitk_batch_offset,
+                                       const index_t block_idx_m,
+                                       const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
+            a_ptr, b_ptr, aq_ptr, bq_ptr, c_ptr, kargs, splitk_batch_offset);
+
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop =
+            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = [&]() {
+            if constexpr(kQuantType == QuantType::AQuantGrouped)
+            {
+                const auto& aq_block_window = gemm_tile_windows.at(I1);
+                return GemmPipeline{}.template operator()(
+                    a_block_window, b_block_window, aq_block_window, kargs.M, num_loop, smem_ptr_0);
+            }
+            else if constexpr(kQuantType == QuantType::BQuantGrouped)
+            {
+                const auto& bq_block_window = gemm_tile_windows.at(I3);
+                return GemmPipeline{}.template operator()(
+                    a_block_window, b_block_window, bq_block_window, num_loop, smem_ptr_0);
+            }
+            else if constexpr(kQuantType == QuantType::RowColQuant ||
+                              kQuantType == QuantType::TensorQuant)
+            {
+                return GemmPipeline{}.template operator()(
+                    a_block_window, b_block_window, num_loop, smem_ptr_0);
+            }
+        }();
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I4);
+
+        if constexpr(kQuantType == QuantType::AQuantGrouped ||
+                     kQuantType == QuantType::BQuantGrouped)
+        {
+            EpiloguePipeline{}(c_block_window, c_block_tile, c_block_window, smem_ptr_0);
+        }
+        else if constexpr(kQuantType == QuantType::RowColQuant)
+        {
+            const auto& aq_block_window = gemm_tile_windows.at(I1);
+            const auto& bq_block_window = gemm_tile_windows.at(I3);
+            EpiloguePipeline{}(c_block_window,
+                               c_block_tile,
+                               c_block_window,
+                               smem_ptr_0,
+                               aq_block_window,
+                               bq_block_window);
+        }
+        else if constexpr(kQuantType == QuantType::TensorQuant)
+        {
+            // TODO: why doesn't readfirstlane work here?
+            // const AccDataType aq_scale =
+            //     __builtin_amdgcn_readfirstlane(type_convert<AccDataType>(*aq_ptr));
+            // const AccDataType bq_scale =
+            //     __builtin_amdgcn_readfirstlane(type_convert<AccDataType>(*bq_ptr));
+            const AccDataType aq_scale = type_convert<AccDataType>(*aq_ptr);
+            const AccDataType bq_scale = type_convert<AccDataType>(*bq_ptr);
+            EpiloguePipeline{}(
+                c_block_window, c_block_tile, c_block_window, smem_ptr_0, aq_scale, bq_scale);
+        }
+    }
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param aq_ptr input AQ pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
+     */
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunGemm2LDS(const ADataType* a_ptr,
+                                           const BDataType* b_ptr,
+                                           const AQDataType* aq_ptr,
+                                           const BQDataType* bq_ptr,
+                                           CDataType* c_ptr,
+                                           void* smem_ptr_0,
+                                           void* smem_ptr_1,
+                                           const QuantGemmKernelArgs& kargs,
+                                           const SplitKBatchOffset& splitk_batch_offset,
+                                           const index_t block_idx_m,
+                                           const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
+            a_ptr, b_ptr, aq_ptr, bq_ptr, c_ptr, kargs, splitk_batch_offset);
+
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = [&]() {
+            if constexpr(kQuantType == QuantType::BQuantGrouped)
+            {
+                const auto& bq_block_window = gemm_tile_windows.at(I3);
+                return GemmPipeline{}.template operator()(a_block_window,
+                                                          b_block_window,
+                                                          bq_block_window,
+                                                          num_loop,
+                                                          smem_ptr_0,
+                                                          smem_ptr_1);
+            }
+            else
+            {
+                return nullptr;
+            }
+        }();
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I4);
+
+        if constexpr(kQuantType == QuantType::BQuantGrouped)
+        {
+            EpiloguePipeline{}(c_block_window, c_block_tile, c_block_window, smem_ptr_0);
+        }
+        else
+        {
+            return;
+            // throw std::runtime_error("DoubleSmemBuffer Not implemented for AQuantGrouped or
+            // RowColQuant"); static_assert(kQuantType == QuantType::BQuantGrouped,
+            // "DoubleSmemBuffer Not implemented");
+        }
+    }
+
+    CK_TILE_DEVICE void operator()(QuantGemmKernelArgs kargs) const
+    {
+        const auto blockId  = amd_wave_read_first_lane(blockIdx.x);
+        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
+        const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+
+        const SplitKBatchOffset splitk_batch_offset(kargs);
+        // options
+        const ADataType* a_ptr   = static_cast<const ADataType*>(kargs.a_ptr);
+        const BDataType* b_ptr   = static_cast<const BDataType*>(kargs.b_ptr);
+        const AQDataType* aq_ptr = static_cast<const AQDataType*>(kargs.aq_ptr);
+        const BQDataType* bq_ptr = static_cast<const BQDataType*>(kargs.bq_ptr);
+        CDataType* c_ptr         = static_cast<CDataType*>(kargs.c_ptr);
+
+        // allocate LDS
+        __shared__ char smem_ptr_0[GetSmemSize()];
+
+        assert(kargs.k_batch == 1);
+        if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+        {
+            __shared__ char smem_ptr_1[GetSmemSize()];
+
+            RunGemm2LDS(a_ptr,
+                        b_ptr,
+                        aq_ptr,
+                        bq_ptr,
+                        c_ptr,
+                        smem_ptr_0,
+                        smem_ptr_1,
+                        kargs,
+                        splitk_batch_offset,
+                        i_m,
+                        i_n);
+        }
+        else
+        {
+            RunGemm(a_ptr,
+                    b_ptr,
+                    aq_ptr,
+                    bq_ptr,
+                    c_ptr,
+                    smem_ptr_0,
+                    kargs,
+                    splitk_batch_offset,
+                    i_m,
+                    i_n);
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
new file mode 100644
index 0000000000..39c8e406b7
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/utility/literals.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/host/stream_utils.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp"
+#include "ck_tile/host.hpp"
+
+#include <hip/hip_runtime.h>
+
+namespace ck_tile {
+
+/// @brief The Grouped GEMM kernel host arguments.
+///
+/// @par Overview
+///      This structure is passed to @ref GroupedGemmKernel "GroupedGemmKernel" when creating kernel
+///      arguments object. It contain all necessary information required to build proper kernel
+///      argument and launch kernel on GPU. This structure defines the GEMM problem configuration by
+///      stating all required information like M,N,K sizes and respective strides.
+struct QuantGroupedGemmHostArgs
+{
+    CK_TILE_HOST QuantGroupedGemmHostArgs(const void* a_ptr_,
+                                          const void* b_ptr_,
+                                          void* e_ptr_,
+                                          const void* aq_ptr_,
+                                          const void* bq_ptr_,
+                                          index_t k_batch_,
+                                          index_t M_,
+                                          index_t N_,
+                                          index_t K_,
+                                          index_t QK_A_,
+                                          index_t QK_B_,
+                                          index_t stride_A_,
+                                          index_t stride_B_,
+                                          index_t stride_E_,
+                                          index_t stride_AQ_,
+                                          index_t stride_BQ_)
+        : a_ptr(a_ptr_),
+          b_ptr(b_ptr_),
+          aq_ptr(aq_ptr_),
+          bq_ptr(bq_ptr_),
+          e_ptr(e_ptr_),
+          M(M_),
+          N(N_),
+          K(K_),
+          QK_A(QK_A_),
+          QK_B(QK_B_),
+          stride_A(stride_A_),
+          stride_B(stride_B_),
+          stride_AQ(stride_AQ_),
+          stride_BQ(stride_BQ_),
+          stride_E(stride_E_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* a_ptr;
+    const void* b_ptr;
+    const void* aq_ptr;
+    const void* bq_ptr;
+    union
+    {
+        void* e_ptr;
+        void* c_ptr;
+    };
+
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t QK_A;
+    index_t QK_B;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_AQ;
+    index_t stride_BQ;
+
+    union
+    {
+        index_t stride_E;
+        index_t stride_C;
+    };
+
+    index_t k_batch;
+};
+
+using QuantGroupedGemmKernelArgs = QuantGemmKernelArgs;
+
+struct QuantGemmTransKernelArg
+{
+    QuantGroupedGemmKernelArgs group_karg;
+    ck_tile::index_t block_start;
+    ck_tile::index_t block_end;
+
+    QuantGemmTransKernelArg() = delete;
+    QuantGemmTransKernelArg(QuantGroupedGemmKernelArgs&& karg, index_t bl_start, index_t bl_end)
+        : group_karg{karg}, block_start{bl_start}, block_end{bl_end}
+    {
+    }
+
+    QuantGemmTransKernelArg(QuantGroupedGemmKernelArgs&& karg)
+        : group_karg{karg}, block_start{0}, block_end{0}
+    {
+    }
+};
+
+template <typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_,
+          QuantType QuantType_>
+struct QuantGroupedGemmKernel
+{
+    /// @brief Inject the UniversalGemmKernel base class to support execution of all necessary
+    /// functions.
+    using Base = QuantGemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_, QuantType_>;
+
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+
+    //// @brief Specify the layout configurations for A, B, C/E
+    using ALayout = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout = remove_cvref_t<typename GemmPipeline::CLayout>;
+
+    /// @brief Specify the data type configurations for A, B, C/E
+    using ADataType   = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType   = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType   = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using AccDataType = remove_cvref_t<typename EpiloguePipeline::AccDataType>;
+
+    using AQDataType =
+        remove_cvref_t<typename detail::get_aq_data_type_or<GemmPipeline, AccDataType>::type>;
+    using BQDataType =
+        remove_cvref_t<typename detail::get_bq_data_type_or<GemmPipeline, AccDataType>::type>;
+
+    static constexpr auto kQuantType = QuantType_;
+
+    /// @brief ALayout and ADataType are expected to be scalars, not a tuple.
+    static_assert(
+        !is_detected<is_tuple, ALayout>::value && !is_detected<is_tuple, ADataType>::value,
+        "ALayout and ADataType must be scalars. Multiple parameters are not currently supported.");
+
+    /// @brief  BLayout and BDataType are expected to be scalars, not a tuple.
+    static_assert(
+        !is_detected<is_tuple, BLayout>::value && !is_detected<is_tuple, BDataType>::value,
+        "BLayout and BDataType must be scalars. Multiple parameters are not currently supported.");
+
+    /// @brief  C/ELayout and C/EDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, CLayout>::value &&
+                      !is_detected<is_tuple, CDataType>::value,
+                  "C/ELayout and C/EDataType must be scalars.");
+
+    using OffsetTile1DPartitioner = OffsettedTile1DPartitioner<TilePartitioner>;
+    using Kernel =
+        QuantGroupedGemmKernel<TilePartitioner, GemmPipeline, EpiloguePipeline, kQuantType>;
+
+    static constexpr index_t kBlockSize       = GemmPipeline::BlockSize;
+    static constexpr bool UsePersistentKernel = GemmPipeline::UsePersistentKernel;
+    static_assert(UsePersistentKernel == true, "UsePersistentKernel must be true");
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        using P_ = GemmPipeline;
+
+        return concat('_', "gemm_grouped", gemm_prec_str<ADataType, BDataType>(),
+                      concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock),
+                      concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()),
+                      concat('x', P_::kPadM, P_::kPadN, P_::kPadK),
+                      (UsePersistentKernel ? "Persistent" : "NonPersistent"));
+        // clang-format on
+    }
+
+    CK_TILE_HOST static auto
+    GetWorkSpaceSize(const std::vector<QuantGroupedGemmHostArgs>& gemm_descs) -> std::size_t
+    {
+        return gemm_descs.size() * sizeof(QuantGemmTransKernelArg);
+    }
+
+    CK_TILE_HOST static auto GetWorkSpaceSize(index_t group_count) -> std::size_t
+    {
+        return group_count * sizeof(QuantGemmTransKernelArg);
+    }
+
+    CK_TILE_HOST static auto BlockSize() -> dim3
+    {
+        if(is_wave32())
+        {
+            return dim3(kBlockSize / 2);
+        }
+        else
+        {
+            return dim3(kBlockSize);
+        }
+    }
+
+    /**
+     * @brief Get the maximum occupancy grid size for the persistent kernel on the current device.
+     * @return The maximum occupancy grid size.
+     * @note This function queries the maximum occupancy of the kernel using
+     *       `hipOccupancyMaxActiveBlocksPerMultiprocessor`.
+     */
+    CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
+    {
+        using ConstantPointer  = const void CK_CONSTANT_ADDRESS_SPACE*;
+        const auto kernel_func = kentry<1, Kernel, ConstantPointer, index_t>;
+        int occupancy;
+        HIP_CHECK_ERROR(
+            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel_func, kBlockSize, 0));
+        const int grid_size = get_available_compute_units(s) * occupancy;
+        return dim3(grid_size, 1, 1);
+    }
+
+    CK_TILE_HOST static auto GridSize(const std::vector<QuantGroupedGemmHostArgs>& gemm_descs)
+    {
+        index_t grid_size = 0;
+        for(const auto& it_desc : gemm_descs)
+        {
+            const auto local_grid_size = TilePartitioner::GridSize(it_desc.M, it_desc.N);
+            grid_size += local_grid_size * it_desc.k_batch;
+        }
+        return dim3(grid_size, 1, 1);
+    }
+
+    CK_TILE_HOST static auto MakeKargs(const std::vector<QuantGroupedGemmHostArgs>& gemm_descs)
+        -> std::vector<QuantGemmTransKernelArg>
+    {
+        std::vector<QuantGemmTransKernelArg> gemm_kernel_args_;
+        index_t group_count = ck_tile::type_convert<ck_tile::index_t>(gemm_descs.size());
+        index_t grid_size   = 0;
+        gemm_kernel_args_.reserve(group_count);
+
+        for(std::size_t i = 0; i < gemm_descs.size(); ++i)
+        {
+            const index_t M = gemm_descs[i].M;
+            const index_t N = gemm_descs[i].N;
+            const index_t K = gemm_descs[i].K;
+
+            if(M == 0 || N == 0 || K == 0)
+            {
+                continue;
+            }
+
+            const index_t stride_a = gemm_descs[i].stride_A;
+            const index_t stride_b = gemm_descs[i].stride_B;
+            const index_t stride_e = gemm_descs[i].stride_C;
+
+            const index_t grid_size_grp = TilePartitioner::GridSize(M, N) * gemm_descs[i].k_batch;
+
+            const index_t block_start = grid_size;
+            const index_t block_end   = grid_size + grid_size_grp;
+
+            grid_size += grid_size_grp;
+
+            auto karg =
+                QuantGroupedGemmKernelArgs{type_convert<const ADataType*>(gemm_descs[i].a_ptr),
+                                           type_convert<const BDataType*>(gemm_descs[i].b_ptr),
+                                           type_convert<CDataType*>(gemm_descs[i].e_ptr),
+                                           type_convert<const AQDataType*>(gemm_descs[i].aq_ptr),
+                                           type_convert<const BQDataType*>(gemm_descs[i].bq_ptr),
+                                           gemm_descs[i].k_batch,
+                                           M,
+                                           N,
+                                           K,
+                                           gemm_descs[i].QK_A,
+                                           gemm_descs[i].QK_B,
+                                           stride_a,
+                                           stride_b,
+                                           stride_e,
+                                           gemm_descs[i].stride_AQ,
+                                           gemm_descs[i].stride_BQ};
+
+            gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end);
+        }
+
+        return gemm_kernel_args_;
+    }
+
+    CK_TILE_HOST static bool IsSupportedArgument(const std::vector<QuantGemmTransKernelArg>& kargs)
+    {
+        for(const auto& karg : kargs)
+        {
+            if(!Base::IsSupportedArgument(karg.group_karg))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize() -> index_t
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    CK_TILE_DEVICE void Run(const QuantGroupedGemmKernelArgs& kargs,
+                            const tuple<index_t, index_t>& block_idx_2d,
+                            const index_t block_idx_z) const
+    {
+        const auto [iM, iN] = block_idx_2d;
+
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+
+        const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z);
+
+        // options
+        const ADataType* a_ptr   = static_cast<const ADataType*>(kargs.a_ptr);
+        const BDataType* b_ptr   = static_cast<const BDataType*>(kargs.b_ptr);
+        const AQDataType* aq_ptr = static_cast<const AQDataType*>(kargs.aq_ptr);
+        const BQDataType* bq_ptr = static_cast<const BQDataType*>(kargs.bq_ptr);
+        CDataType* c_ptr         = static_cast<CDataType*>(kargs.c_ptr);
+
+        static_assert(GemmPipeline::DoubleSmemBuffer == false,
+                      "DoubleSmemBuffer needs to be false");
+        // allocate LDS
+        __shared__ char smem_ptr_0[GetSmemSize()];
+
+        RunGemmWithPipelineSelection(
+            a_ptr, b_ptr, aq_ptr, bq_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @note The GEMM pipeline is selected in-kernel based on the number of K-loops
+     *       and the tail-number. This is needed for the persistent tile-loop when
+     *       we didn't have access to the K dimension on the host.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param aq_ptr input AQ pointer
+     * @param bq_ptr input BQ pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k
+     * batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     */
+    CK_TILE_DEVICE static void
+    RunGemmWithPipelineSelection(const ADataType* a_ptr,
+                                 const BDataType* b_ptr,
+                                 const AQDataType* aq_ptr,
+                                 const BQDataType* bq_ptr,
+                                 CDataType* c_ptr,
+                                 void* smem_ptr_0,
+                                 const QuantGroupedGemmKernelArgs& kargs,
+                                 const typename Base::SplitKBatchOffset& splitk_batch_offset,
+                                 const index_t block_idx_m,
+                                 const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            Base::template MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                a_ptr, b_ptr, aq_ptr, bq_ptr, c_ptr, kargs, splitk_batch_offset);
+
+        const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows =
+            Base::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        const auto& a_block_window = gemm_tile_windows.at(Base::I0);
+        const auto& b_block_window = gemm_tile_windows.at(Base::I2);
+
+        // Get hot-loop and tail configuration
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
+        const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        // Run GEMM pipeline
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, has_hot_loop, tail_num, smem_ptr_0);
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(Base::I4);
+        if constexpr(kQuantType == QuantType::RowColQuant)
+        {
+            const auto& aq_block_window = gemm_tile_windows.at(Base::I1);
+            const auto& bq_block_window = gemm_tile_windows.at(Base::I3);
+            EpiloguePipeline{}.template
+            operator()<decltype(c_block_window), decltype(c_block_tile), decltype(c_block_window)>(
+                c_block_window,
+                c_block_tile,
+                c_block_window,
+                smem_ptr_0,
+                aq_block_window,
+                bq_block_window);
+        }
+    }
+
+    // For persistent kernels
+    template <bool U   = UsePersistentKernel,
+              typename = std::enable_if_t<U>,
+              typename = void> // extra template parameter to avoid redefinition
+    CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+                                   const index_t group_count) const
+    {
+        const index_t grid_size  = ck_tile::get_grid_size();
+        const auto gemm_desc_ptr = reinterpret_cast<const QuantGemmTransKernelArg*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+        index_t block_id      = ck_tile::get_block_1d_id(); // initial block_id
+        index_t cum_grid_size = 0;
+        for(index_t group_id = 0; group_id < group_count; ++group_id)
+        {
+            const auto& kargs      = gemm_desc_ptr[group_id].group_karg;
+            const auto& k_batch    = kargs.k_batch;
+            const auto block_start = cum_grid_size;
+            cum_grid_size += TilePartitioner::GridSize(kargs.M, kargs.N) * k_batch;
+            while(block_id < cum_grid_size)
+            {
+                const auto grid_size_2d = TilePartitioner::GridSize(kargs.M, kargs.N);
+                const auto block_idx_2d = OffsetTile1DPartitioner::GetOffsetedTileIndex(
+                    0, kargs.M, kargs.N, (block_id - block_start) % grid_size_2d);
+                Run(kargs, block_idx_2d, (block_id - block_start) / grid_size_2d);
+                block_id = block_id + grid_size; // advance to next block
+                // NOTE: this check is redundant but helps the compiler avoid spilling some VGPR
+                if(block_id >= cum_grid_size)
+                {
+                    break; // exit the loop if all blocks are processed
+                }
+            }
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
similarity index 100%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
similarity index 80%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
index 52c99f8e99..926f63b5a9 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
@@ -55,44 +55,43 @@ struct GemmAQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
         static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
         if constexpr(PreshuffleQuant)
         {
-            using TileEncodingPattern =
-                TileDistributionEncodingPatternAQ<BlockGemmShape,
-                                                  WarpGemm,
-                                                  BlockSize,
-                                                  MPerBlock / WarpGemm::kM,
-                                                  ck_tile::integer_least_multiple(
-                                                      WarpGemm::kM * KPerBlockAQ, get_warp_size()),
-                                                  KPerBlockAQ,
-                                                  VecLoadSize,
-                                                  PreshuffleQuant>;
+            using TileEncodingPattern = tile_distribution_encoding_pattern_aq<
+                BlockGemmShape,
+                WarpGemm,
+                BlockSize,
+                MPerBlock / WarpGemm::kM,
+                ck_tile::integer_least_multiple(WarpGemm::kM * KPerBlockAQ, get_warp_size()),
+                KPerBlockAQ,
+                VecLoadSize,
+                PreshuffleQuant>;
 
-            return TileEncodingPattern::Make2DStaticTileDistribution();
+            return TileEncodingPattern::make_2d_static_tile_distribution();
         }
         else
         {
             if constexpr(Problem::TransposeC)
             {
                 using TileEncodingPatternTransposeC =
-                    TileDistributionEncodingPatternAQTransposedC<BlockGemmShape,
-                                                                 WarpGemm,
-                                                                 BlockSize,
-                                                                 MPerBlock,
-                                                                 KPerBlockAQ,
-                                                                 VecLoadSize>;
-                return TileEncodingPatternTransposeC::Make2DStaticTileDistribution();
+                    tile_distribution_encoding_pattern_aq_transposed_c<BlockGemmShape,
+                                                                       WarpGemm,
+                                                                       BlockSize,
+                                                                       MPerBlock,
+                                                                       KPerBlockAQ,
+                                                                       VecLoadSize>;
+                return TileEncodingPatternTransposeC::make_2d_static_tile_distribution();
             }
             else
             {
-                using TileEncodingPattern = TileDistributionEncodingPatternAQ<BlockGemmShape,
-                                                                              WarpGemm,
-                                                                              BlockSize,
-                                                                              MPerBlock,
-                                                                              KPerBlockAQ,
-                                                                              KPerBlockAQ,
-                                                                              VecLoadSize,
-                                                                              PreshuffleQuant>;
+                using TileEncodingPattern = tile_distribution_encoding_pattern_aq<BlockGemmShape,
+                                                                                  WarpGemm,
+                                                                                  BlockSize,
+                                                                                  MPerBlock,
+                                                                                  KPerBlockAQ,
+                                                                                  KPerBlockAQ,
+                                                                                  VecLoadSize,
+                                                                                  PreshuffleQuant>;
 
-                return TileEncodingPattern::Make2DStaticTileDistribution();
+                return TileEncodingPattern::make_2d_static_tile_distribution();
             }
         }
     }
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
similarity index 98%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
index 037cef0553..24254013a4 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
@@ -9,7 +9,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/host/concat.hpp"
 
 namespace ck_tile {
@@ -330,7 +330,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
             if constexpr(is_a_col_major)
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
-                    Policy::template MakeShuffled2DStaticTileDistribution<Problem>());
+                    Policy::template make_shuffled_2d_static_tile_distribution<Problem>());
                 transpose_tile2d(a_shuffle_tmp, a_block_tile);
                 Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
             }
@@ -342,7 +342,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
             if constexpr(is_b_row_major)
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffled2DStaticTileDistribution<Problem>());
+                    Policy::template make_shuffled_2d_static_tile_distribution<Problem>());
                 transpose_tile2d(b_shuffle_tmp, b_block_tile);
                 Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
             }
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp
similarity index 100%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
similarity index 94%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
index ff986d86fb..eea8038edf 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
@@ -52,14 +52,14 @@ struct GemmBQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
                                                            Problem::TransposeC>;
 
         static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-        using TileEncodingPattern = TileDistributionEncodingPatternBQ<BlockGemmShape,
-                                                                      WarpGemm,
-                                                                      BlockSize,
-                                                                      NPerBlock,
-                                                                      KPerBlockBQ,
-                                                                      VecLoadSize>;
+        using TileEncodingPattern = tile_distribution_encoding_pattern_bq<BlockGemmShape,
+                                                                          WarpGemm,
+                                                                          BlockSize,
+                                                                          NPerBlock,
+                                                                          KPerBlockBQ,
+                                                                          VecLoadSize>;
 
-        return TileEncodingPattern::Make2DStaticTileDistribution();
+        return TileEncodingPattern::make_2d_static_tile_distribution();
     }
 
     template <typename Problem>
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
similarity index 98%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
index 7ce6598b80..c27fbf5b50 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
@@ -9,7 +9,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
-#include "ck_tile/ops/gemm_group_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/host/concat.hpp"
 
 namespace ck_tile {
@@ -326,7 +326,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
             if constexpr(is_a_col_major)
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
-                    Policy::template MakeShuffled2DStaticTileDistribution<Problem>());
+                    Policy::template make_shuffled_2d_static_tile_distribution<Problem>());
                 transpose_tile2d(a_shuffle_tmp, a_block_tile);
                 Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
             }
@@ -338,7 +338,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
             if constexpr(is_b_row_major)
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffled2DStaticTileDistribution<Problem>());
+                    Policy::template make_shuffled_2d_static_tile_distribution<Problem>());
                 transpose_tile2d(b_shuffle_tmp, b_block_tile);
                 Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
             }
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
similarity index 94%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
index 56a906a6bc..54b64c34be 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
@@ -53,7 +53,7 @@ template <typename BlockGemmShape,
           index_t KPerBlockAQ,
           index_t VecSize,
           bool PreshuffleQuant>
-struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_aq : public tile_distribution_encoding_pattern
 {
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
     static constexpr index_t warp_size = get_warp_size();
@@ -70,7 +70,7 @@ struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPatter
     // KWarps > 1 isn't supported
     static_assert(KWarps == 1);
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         if constexpr(PreshuffleQuant)
         {
@@ -119,7 +119,8 @@ template <typename BlockGemmShape,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize>
-struct TileDistributionEncodingPatternAQTransposedC : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_aq_transposed_c
+    : public tile_distribution_encoding_pattern
 {
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
@@ -152,7 +153,7 @@ struct TileDistributionEncodingPatternAQTransposedC : public TileDistributionEnc
 
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover the blocktile along Y.");
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<NWarps, XR>,
@@ -171,7 +172,7 @@ template <typename BlockGemmShape,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize>
-struct TileDistributionEncodingPatternBQ : public TileDistributionEncodingPattern
+struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding_pattern
 {
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
@@ -204,7 +205,7 @@ struct TileDistributionEncodingPatternBQ : public TileDistributionEncodingPatter
 
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover the blocktile along Y.");
 
-    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<MWarps, XR>,
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_quant_pipeline_problem.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp
similarity index 52%
rename from include/ck_tile/ops/gemm_group_quant/pipeline/gemm_quant_pipeline_problem.hpp
rename to include/ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp
index 69b8cd901e..4978e70099 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_quant_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp
@@ -14,6 +14,7 @@ namespace ck_tile {
 template <typename ADataType_,
           typename AQDataType_,
           typename BDataType_,
+          typename BQDataType_,
           typename CDataType_,
           typename BlockGemmShape_,
           typename Traits_,
@@ -23,12 +24,12 @@ template <typename ADataType_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full>
-struct GemmAQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_,
-                                                                      BDataType_,
-                                                                      CDataType_,
-                                                                      BlockGemmShape_,
-                                                                      Traits_,
-                                                                      ComputeDataType_>
+struct GemmQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_,
+                                                                     BDataType_,
+                                                                     CDataType_,
+                                                                     BlockGemmShape_,
+                                                                     Traits_,
+                                                                     ComputeDataType_>
 {
     using Base = GemmPipelineProblemBase<ADataType_,
                                          BDataType_,
@@ -44,6 +45,7 @@ struct GemmAQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_
     using typename Base::CDataType;
     using typename Base::ComputeDataType;
     using AQDataType = remove_cvref_t<AQDataType_>;
+    using BQDataType = remove_cvref_t<BQDataType_>;
 
     using BlockGemmShape = typename Base::BlockGemmShape;
 
@@ -51,18 +53,19 @@ struct GemmAQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_
     using typename Base::BLayout;
     using typename Base::CLayout;
 
-    static constexpr bool TransposeC = TransposeC_;
-
+    static constexpr bool TransposeC       = TransposeC_;
+    static constexpr bool PreshuffleB      = Traits::PreshuffleB;
+    static constexpr bool DoubleSmemBuffer = Traits::DoubleSmemBuffer;
     using Base::kBlockSize;
 
     using Base::kPadK;
     using Base::kPadM;
     using Base::kPadN;
 
-    using Base::DoubleSmemBuffer;
     using Base::VectorLoadSize;
 
     using AQLayout = remove_cvref_t<typename Traits::AQLayout>;
+    using BQLayout = remove_cvref_t<typename Traits::BQLayout>;
 
     static constexpr uint32_t kQuantGroupSize = QuantGroupSize_;
     static constexpr auto Scheduler           = Scheduler_;
@@ -75,7 +78,7 @@ struct GemmAQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
-        return concat('_', "gemm_aquant_problem",
+        return concat('_', "gemm_quant_problem",
                       concat('x', VectorLoadSize, kBlockSize),
                       concat('x', kPadM, kPadN, kPadK),
                       Scheduler,
@@ -94,6 +97,13 @@ struct GemmAQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_
         static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
         return kPadK ? 1 : GetAlignmentAQ();
     }();
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentBQ()
+    {
+        return VectorLoadSize / sizeof(BQDataType);
+    }
+
+    static constexpr index_t VectorSizeBQ = []() { return kPadK ? 1 : GetAlignmentBQ(); }();
 };
 
 template <typename ADataType_,
@@ -108,18 +118,19 @@ template <typename ADataType_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full>
-using GemmAQuantPipelineProblem = GemmAQuantPipelineProblemBase<ADataType_,
-                                                                AQDataType_,
-                                                                BDataType_,
-                                                                CDataType_,
-                                                                BlockGemmShape_,
-                                                                Traits_,
-                                                                QuantGroupSize_,
-                                                                TransposeC_,
-                                                                ComputeDataType_,
-                                                                Scheduler_,
-                                                                HasHotLoop_,
-                                                                TailNum_>;
+using GemmAQuantPipelineProblem = GemmQuantPipelineProblemBase<ADataType_,
+                                                               AQDataType_,
+                                                               BDataType_,
+                                                               void, // no BQDataType for AQuant
+                                                               CDataType_,
+                                                               BlockGemmShape_,
+                                                               Traits_,
+                                                               QuantGroupSize_,
+                                                               TransposeC_,
+                                                               ComputeDataType_,
+                                                               Scheduler_,
+                                                               HasHotLoop_,
+                                                               TailNum_>;
 
 template <typename ADataType_,
           typename BDataType_,
@@ -132,96 +143,43 @@ template <typename ADataType_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full>
-struct GemmBQuantPipelineProblemBase : public GemmPipelineProblemBase<ADataType_,
-                                                                      BDataType_,
-                                                                      CDataType_,
-                                                                      BlockGemmShape_,
-                                                                      Traits_,
-                                                                      ComputeDataType_>
-{
-    using Base = GemmPipelineProblemBase<ADataType_,
-                                         BDataType_,
-                                         CDataType_,
-                                         BlockGemmShape_,
-                                         Traits_,
-                                         ComputeDataType_>;
-
-    using Traits = typename Base::Traits;
-
-    using typename Base::ADataType;
-    using typename Base::BDataType;
-    using typename Base::CDataType;
-    using typename Base::ComputeDataType;
-    using BQDataType = remove_cvref_t<BQDataType_>;
-
-    using BlockGemmShape = typename Base::BlockGemmShape;
-
-    using typename Base::ALayout;
-    using typename Base::BLayout;
-    using typename Base::CLayout;
-
-    static constexpr bool TransposeC = Traits::TransposeC;
-
-    using Base::kBlockSize;
-
-    using Base::kPadK;
-    using Base::kPadM;
-    using Base::kPadN;
-
-    using Base::DoubleSmemBuffer;
-    using Base::VectorLoadSize;
-
-    using BQLayout = remove_cvref_t<typename Traits::BQLayout>;
-
-    static constexpr uint32_t kQuantGroupSize = QuantGroupSize_;
-    static constexpr auto Scheduler           = Scheduler_;
-    static constexpr auto HasHotLoop          = HasHotLoop_;
-    static constexpr auto TailNum             = TailNum_;
-
-    static_assert(BlockGemmShape::kK % kQuantGroupSize == 0);
-    static_assert(Scheduler == GemmPipelineScheduler::Intrawave);
-
-    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
-    {
-        // clang-format off
-        return concat('_', "gemm_bquant_problem",
-                      concat('x', VectorLoadSize, kBlockSize),
-                      concat('x', kPadM, kPadN, kPadK),
-                      Scheduler,
-                      "QuantGroupSize",
-                      kQuantGroupSize);
-        // clang-format on
-    }
-
-    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentBQ()
-    {
-        return VectorLoadSize / sizeof(BQDataType);
-    }
-
-    static constexpr index_t VectorSizeBQ = []() { return kPadK ? 1 : GetAlignmentBQ(); }();
-};
+using GemmBQuantPipelineProblem = GemmQuantPipelineProblemBase<ADataType_,
+                                                               void, // no AQDataType for BQuant
+                                                               BDataType_,
+                                                               BQDataType_,
+                                                               CDataType_,
+                                                               BlockGemmShape_,
+                                                               Traits_,
+                                                               QuantGroupSize_,
+                                                               false, // no TransposeC
+                                                               ComputeDataType_,
+                                                               Scheduler_,
+                                                               HasHotLoop_,
+                                                               TailNum_>;
 
 template <typename ADataType_,
           typename BDataType_,
-          typename BQDataType_,
           typename CDataType_,
+          typename AccDataType_,
           typename BlockGemmShape_,
           typename Traits_,
-          uint32_t QuantGroupSize_,
-          typename ComputeDataType_        = ADataType_,
+          bool TransposeC_                 = false,
+          typename ComputeDataType_        = BDataType_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full>
-using GemmBQuantPipelineProblem = GemmBQuantPipelineProblemBase<ADataType_,
-                                                                BDataType_,
-                                                                BQDataType_,
-                                                                CDataType_,
-                                                                BlockGemmShape_,
-                                                                Traits_,
-                                                                QuantGroupSize_,
-                                                                ComputeDataType_,
-                                                                Scheduler_,
-                                                                HasHotLoop_,
-                                                                TailNum_>;
-
+using GemmRowColTensorQuantPipelineProblem =
+    GemmQuantPipelineProblemBase<ADataType_,
+                                 AccDataType_,
+                                 BDataType_,
+                                 AccDataType_,
+                                 CDataType_,
+                                 BlockGemmShape_,
+                                 Traits_,
+                                 1, // no group size applicable
+                                 TransposeC_,
+                                 ComputeDataType_,
+                                 Scheduler_,
+                                 HasHotLoop_,
+                                 TailNum_>;
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_base_policy.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_base_policy.hpp
new file mode 100644
index 0000000000..19c1223b78
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_base_policy.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp"
+
+namespace ck_tile {
+
+struct GemmWPQuantPipelineAgBgCrPolicy : public UniversalWeightPreshufflePipelineAgBgCrPolicy
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeBQ()
+    {
+        using BQDataType              = remove_cvref_t<typename Problem::BQDataType>;
+        constexpr index_t NPerBlock   = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock   = Problem::BlockGemmShape::kK;
+        constexpr index_t KPerBlockBQ = KPerBlock / Problem::kQuantGroupSize;
+
+        return GetABQGlobalVectorLoadSize<Problem, BQDataType, NPerBlock, KPerBlockBQ>();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBQDramTileDistribution()
+    {
+        return GemmBQuantPipelineAgBgCrDefaultPolicy::MakeBQDramTileDistribution<Problem>();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWeightPreshuffleBQuant()
+    {
+        using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
+
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<typename Problem::BDataType, ck_tile::pk_int4_t>,
+                               typename Problem::ADataType,
+                               typename Problem::BDataType>;
+
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ADataType,
+                                            BTypeToUse,
+                                            typename Problem::CDataType,
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            Problem::TransposeC>;
+
+        // TODO : Use a custom block policy for AsBrCr
+        using BlockGemmPolicy =
+            BlockWeightPreshuffleASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                              typename Problem::BDataType,
+                                                              typename Problem::CDataType,
+                                                              BlockWarps,
+                                                              WarpGemm>;
+        return BlockGemmWeightPreshuffleBQuantARegBRegCReg<Problem, BlockGemmPolicy>{};
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
new file mode 100644
index 0000000000..01c1a72335
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/host/concat.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename PipelinePolicy = GemmWPQuantPipelineAgBgCrPolicy>
+struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV2<Problem>
+{
+    using Base            = WeightPreshufflePipelineAGmemBGmemCRegV2<Problem>;
+    using ADataType       = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+    using BQDataType      = remove_cvref_t<typename Problem::BQDataType>;
+    using CDataType       = remove_cvref_t<typename Problem::CDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using BlockGemmShape  = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    using ALayout  = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout  = remove_cvref_t<typename Problem::BLayout>;
+    using BQLayout = remove_cvref_t<typename Problem::BQLayout>;
+    using CLayout  = remove_cvref_t<typename Problem::CLayout>;
+
+    using BlockWeightPreshuffle = remove_cvref_t<
+        decltype(PipelinePolicy::template GetBlockWeightPreshuffleBQuant<Problem>())>;
+
+    static constexpr auto config =
+        BlockWeightPreshuffle::BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+    using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+    using Base::kKPerBlock;
+    using Base::kMPerBlock;
+    using Base::kNPerBlock;
+
+    using Base::KIterPerWarp;
+    using Base::MIterPerWarp;
+    using Base::NIterPerWarp;
+
+    using Base::BlockSize;
+
+    using Base::kPadK;
+    using Base::kPadM;
+    using Base::kPadN;
+
+    using Base::I0;
+    using Base::I1;
+    using Base::I2;
+
+    using Base::MWarp;
+    using Base::NWarp;
+
+    using Base::KPerBlockPerIter;
+    using Base::MPerBlockPerIter;
+
+    using Base::flatKPerWarp;
+    using Base::flatNPerWarp;
+
+    using Base::m_preload;
+
+    static constexpr index_t QuantGroupSize = Problem::kQuantGroupSize;
+    static constexpr index_t KPerBlockBQ    = BlockGemmShape::kK / QuantGroupSize;
+    static constexpr index_t QScalesPerBlockRow =
+        (kKPerBlock + QuantGroupSize - 1) / QuantGroupSize;
+
+    static constexpr index_t GetVectorSizeBQ()
+    {
+        return PipelinePolicy::template GetVectorSizeBQ<Problem>();
+    }
+    static constexpr index_t KIterPerQScale = KIterPerWarp / QScalesPerBlockRow;
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0);
+        constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1);
+        return concat('_', "bquant_pipeline_AgBgCrV2_preshuffleB", 
+                      concat('x', kMPerBlock, kNPerBlock, kKPerBlock),
+                      BlockSize,
+                      concat('x', WaveNumM, WaveNumN),
+                      concat('x', Base::GetVectorSizeA(), Base::GetVectorSizeB(), GetVectorSizeBQ()),
+                      concat('x', kPadM, kPadN, kPadK), QuantGroupSize);
+        // clang-format on
+    }
+
+    static constexpr bool PreshuffleB = Problem::PreshuffleB;
+    static constexpr auto TailNum     = Problem::TailNum;
+
+    template <TailNumber TailNum,
+              typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename BQDramBlockWindowTmp,
+              typename AElementFunction,
+              index_t UnaryOpSize_ = 8>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem_ping,
+                                   void* p_smem_pong) const
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BFlatBlockWindowTmp::DataType>> &&
+                std::is_same_v<BQDataType, remove_cvref_t<typename BQDramBlockWindowTmp::DataType>>,
+            "A/B/BQ Dram block window should have the same data type as appropriate "
+            "([A|B|BQ]DataType) defined in Problem definition!");
+
+        constexpr bool is_a_col_major = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
+        static_assert(!is_a_col_major, "A must be row major (col major not supported yet)");
+
+        constexpr bool is_bq_col_major = std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>;
+        static_assert(is_bq_col_major, "Bq must be col major (row major not supported yet)");
+
+        constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
+        static_assert(!is_b_row_major, "B must be col major (row major not supported yet)");
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // A tile in LDS
+        ADataType* p_a_lds_ping = static_cast<ADataType*>(p_smem_ping);
+        ADataType* p_a_lds_pong = static_cast<ADataType*>(p_smem_pong);
+
+        constexpr auto a_lds_block_desc =
+            PipelinePolicy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block_ping =
+            make_tensor_view<address_space_enum::lds>(p_a_lds_ping, a_lds_block_desc);
+        auto a_lds_block_pong =
+            make_tensor_view<address_space_enum::lds>(p_a_lds_pong, a_lds_block_desc);
+
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());
+
+        auto a_copy_lds_window_ping =
+            make_tile_window(a_lds_block_ping,
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());
+
+        auto a_copy_lds_window_pong =
+            make_tile_window(a_lds_block_pong,
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());
+
+        // ping-pong window for A LDS
+        auto a_warp_window_ping_tmp =
+            make_tile_window(a_lds_block_ping,
+                             make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+                             {iMWarp * WG::kM, 0},
+                             make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+
+        auto a_warp_window_pong_tmp =
+            make_tile_window(a_lds_block_pong,
+                             make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+                             {iMWarp * WG::kM, 0},
+                             make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_ping_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows_ping;
+
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_pong_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows_pong;
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows_ping(mIter)(kIter) = a_warp_window_ping_tmp;
+
+                move_tile_window(a_warp_windows_ping(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows_pong(mIter)(kIter) = a_warp_window_pong_tmp;
+
+                move_tile_window(a_warp_windows_pong(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+
+        // Block GEMM
+        auto block_weight_preshuffle = BlockWeightPreshuffle();
+        // Acc register tile
+        auto c_block_tile = block_weight_preshuffle.MakeCBlockTile();
+
+        // B flat DRAM window for load
+        auto b_flat_distribution =
+            PipelinePolicy::template MakeBFlatDramTileDistribution<Problem>();
+        auto b_flat_dram_window = // tile_window_with_static_distribution
+            make_tile_window(
+                b_flat_dram_block_window_tmp.get_bottom_tensor_view(), // from kernel gemm_pad_views
+                make_tuple(number<flatNPerWarp>{}, number<flatKPerWarp>{}),
+                b_flat_dram_block_window_tmp.get_window_origin(),
+                b_flat_distribution);
+
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+        using BTileType = decltype(make_static_distributed_tensor<BTypeToUse>(b_flat_distribution));
+
+        // pingpong buffer for B
+        statically_indexed_array<
+            statically_indexed_array<decltype(b_flat_dram_window), KIterPerWarp>,
+            NIterPerWarp>
+            b_flat_dram_windows;
+
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
+            b_warp_tensor_ping;
+
+        statically_indexed_array<statically_indexed_array<BTileType, KIterPerWarp>, NIterPerWarp>
+            b_warp_tensor_pong;
+
+        // BQ DRAM window for load
+        auto bq_copy_dram_window =
+            make_tile_window(bq_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kNPerBlock>{}, number<KPerBlockBQ>{}),
+                             bq_dram_block_window_tmp.get_window_origin(),
+                             PipelinePolicy::template MakeBQDramTileDistribution<Problem>());
+
+        // Prefetch A0
+        auto a_block_tile = load_tile(a_copy_dram_window);
+        // move A window to next k
+        move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+        // prefetch B
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window;
+
+                move_tile_window(b_flat_dram_windows(nIter)(kIter),
+                                 {nIter * flatNPerWarp, kIter * flatKPerWarp});
+
+                load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
+            });
+        });
+        // move B window to next flat K
+        move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});
+
+        // Strictly not needed given type deduction, but helps with readability
+        using BQBlockTileDistr = decltype(bq_copy_dram_window.get_tile_distribution());
+        using BQBlockTile =
+            decltype(make_static_distributed_tensor<BQDataType>(BQBlockTileDistr{}));
+
+        // Load tile 0 for BQ data directly into registers for block tile
+        BQBlockTile bq_block_tile, bq_block_tile_2;
+        bq_block_tile = load_tile(bq_copy_dram_window);
+        // move BQ to tile 1
+        move_tile_window(bq_copy_dram_window, {0, KPerBlockBQ});
+
+        // Prefill A0
+        auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+        store_tile(a_copy_lds_window_ping, a_block_tile_tmp);
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // Prefetch A1
+        a_block_tile = load_tile(a_copy_dram_window);
+        // move A window to next k
+        move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+        // initialize C
+        tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+        block_sync_lds();
+
+        // preload A00,A10 from lds
+        statically_indexed_array<decltype(load_tile(a_warp_windows_ping(number<0>{})(number<0>{}))),
+                                 m_preload>
+            a_warp_tensor;
+
+        static_for<0, m_preload, 1>{}([&](auto loadIter) {
+            constexpr auto mIter = loadIter % MIterPerWarp;
+            constexpr auto kIter = loadIter / MIterPerWarp;
+            a_warp_tensor(loadIter) =
+                load_tile(a_warp_windows_ping(number<mIter>{})(number<kIter>{}));
+        });
+        __builtin_amdgcn_sched_barrier(0);
+
+        // MAIN LOOP
+        index_t iCounter = (num_loop - 1) / 2;
+        while(iCounter > 0)
+        {
+            // prefetch B(2i+1)
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window;
+
+                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
+                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
+                });
+            });
+            move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});
+
+            bq_block_tile_2 = load_tile(bq_copy_dram_window);
+            move_tile_window(bq_copy_dram_window, {0, KPerBlockBQ});
+
+            // Prefill A(2i+1)
+            a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+            store_tile(a_copy_lds_window_pong, a_block_tile_tmp);
+
+            // Prefetch A(2i+2)
+            a_block_tile = load_tile(a_copy_dram_window);
+            // move A window to next k
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+            // GEMM 2i
+            block_weight_preshuffle(c_block_tile,
+                                    a_warp_tensor,
+                                    b_warp_tensor_ping,
+                                    bq_block_tile,
+                                    a_warp_windows_ping);
+
+            static_for<0, m_preload, 1>{}([&](auto loadIter) {
+                constexpr auto mIter = loadIter % MIterPerWarp;
+                constexpr auto kIter = loadIter / MIterPerWarp;
+                a_warp_tensor(loadIter) =
+                    load_tile(a_warp_windows_pong(number<mIter>{})(number<kIter>{}));
+            });
+            Base::HotLoopScheduler();
+
+            // Next K
+
+            // prefetch B(2i+2)
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window;
+
+                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
+                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
+                });
+            });
+            move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});
+
+            bq_block_tile = load_tile(bq_copy_dram_window);
+            move_tile_window(bq_copy_dram_window, {0, KPerBlockBQ});
+
+            // Prefill A(2i+2)
+            a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+            store_tile(a_copy_lds_window_ping, a_block_tile_tmp);
+
+            // Prefetch A(2i+3)
+            a_block_tile = load_tile(a_copy_dram_window);
+            // move A window to next k
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+            // GEMM 2i+1
+            block_weight_preshuffle(c_block_tile,
+                                    a_warp_tensor,
+                                    b_warp_tensor_pong,
+                                    bq_block_tile_2,
+                                    a_warp_windows_pong);
+
+            static_for<0, m_preload, 1>{}([&](auto loadIter) {
+                constexpr auto mIter = loadIter % MIterPerWarp;
+                constexpr auto kIter = loadIter / MIterPerWarp;
+                a_warp_tensor(loadIter) =
+                    load_tile(a_warp_windows_ping(number<mIter>{})(number<kIter>{}));
+            });
+            Base::HotLoopScheduler();
+
+            iCounter--;
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            // prefetch B(loopK)
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window;
+
+                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
+                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
+
+                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
+                });
+            });
+            bq_block_tile_2 = load_tile(bq_copy_dram_window);
+
+            // Prefill A(loopK)
+            a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+            store_tile(a_copy_lds_window_pong, a_block_tile_tmp);
+
+            // GEMM loopK-1
+            block_weight_preshuffle(c_block_tile,
+                                    a_warp_tensor,
+                                    b_warp_tensor_ping,
+                                    bq_block_tile,
+                                    a_warp_windows_ping);
+
+            static_for<0, m_preload, 1>{}([&](auto loadIter) {
+                constexpr auto mIter = loadIter % MIterPerWarp;
+                constexpr auto kIter = loadIter / MIterPerWarp;
+                a_warp_tensor(loadIter) =
+                    load_tile(a_warp_windows_pong(number<mIter>{})(number<kIter>{}));
+            });
+
+            Base::Last2ndHotLoopScheduler();
+
+            // GEMM loopK
+            block_weight_preshuffle(c_block_tile,
+                                    a_warp_tensor,
+                                    b_warp_tensor_pong,
+                                    bq_block_tile_2,
+                                    a_warp_windows_pong);
+            Base::LastHotLoopScheduler();
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            // GEMM loopK
+            block_weight_preshuffle(c_block_tile,
+                                    a_warp_tensor,
+                                    b_warp_tensor_ping,
+                                    bq_block_tile,
+                                    a_warp_windows_ping);
+            Base::LastHotLoopScheduler();
+        }
+
+        return c_block_tile;
+    }
+
+    template <typename ADramBlockWindowTmp,
+              typename BFlatBlockWindowTmp,
+              typename BQDramBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem_ping,
+                                   void* p_smem_pong) const
+    {
+        return operator()<TailNum>(
+            a_dram_block_window_tmp,
+            [](const ADataType& a) { return a; },
+            b_flat_dram_block_window_tmp,
+            bq_dram_block_window_tmp,
+            num_loop,
+            p_smem_ping,
+            p_smem_pong);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp b/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
new file mode 100644
index 0000000000..52a326a897
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include <cstdint>
+
+namespace ck_tile {
+
+enum struct QuantType : std::uint16_t
+{
+    AQuantGrouped = 0,
+    BQuantGrouped = 1,
+    RowColQuant   = 2,
+    TensorQuant   = 3
+};
+
+inline std::string quant_type_to_string(QuantType quant_type)
+{
+    switch(quant_type)
+    {
+    case QuantType::AQuantGrouped: return "AQuantGrouped";
+    case QuantType::BQuantGrouped: return "BQuantGrouped";
+    case QuantType::RowColQuant: return "RowColQuant";
+    case QuantType::TensorQuant: return "TensorQuant";
+    default: return "Unknown";
+    }
+}
+
+template <bool kPadM_,
+          bool kPadN_,
+          bool kPadK_,
+          bool PreshuffleQuant_,
+          bool PreshuffleB_,
+          typename ALayout_,
+          typename BLayout_,
+          typename CLayout_,
+          QuantType QuantType_,
+          typename AQLayout_        = ALayout_,
+          typename BQLayout_        = BLayout_,
+          bool DoubleSmemBuffer_    = false,
+          bool UsePersistentKernel_ = false>
+struct TileGemmQuantTraits
+{
+    static constexpr bool kPadM = kPadM_;
+    static constexpr bool kPadN = kPadN_;
+    static constexpr bool kPadK = kPadK_;
+
+    static constexpr QuantType kQuantType = QuantType_;
+
+    static constexpr int _VectorSize       = 16;
+    static constexpr bool DoubleSmemBuffer = DoubleSmemBuffer_;
+
+    using ALayout  = ALayout_;
+    using BLayout  = BLayout_;
+    using CLayout  = CLayout_;
+    using AQLayout = AQLayout_;
+    using BQLayout = BQLayout_;
+
+    // TODO: It should be replaced to single value
+    using AsLayout = ALayout_;
+    using BsLayout = BLayout_;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+    static constexpr index_t NumWaveGroups      = 1;
+    static constexpr bool UsePersistentKernel   = UsePersistentKernel_;
+
+    static constexpr bool PreshuffleQuant = PreshuffleQuant_;
+    static constexpr bool PreshuffleB     = PreshuffleB_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/grouped_convolution.hpp b/include/ck_tile/ops/grouped_convolution.hpp
index 09b50f26b0..1dd13b6246 100644
--- a/include/ck_tile/ops/grouped_convolution.hpp
+++ b/include/ck_tile/ops/grouped_convolution.hpp
@@ -12,5 +12,7 @@
 #include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
index f4cc5d7138..e68a510a0c 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
@@ -24,7 +24,10 @@ struct GroupedConvBwdDataKernelArgs
 
     using ConvToGemmTransformer =
         TransformConvBwdDataToGemm<GroupedConvTraitsType_::NDimSpatial,
-                                   GroupedConvTraitsType_::ConvSpecialization>;
+                                   GroupedConvTraitsType_::ConvSpecialization,
+                                   GroupedConvTraitsType_::VectorSizeA,
+                                   GroupedConvTraitsType_::VectorSizeB,
+                                   GroupedConvTraitsType_::VectorSizeC>;
     static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     static constexpr auto I0 = number<0>();
@@ -384,9 +387,8 @@ struct GroupedConvBwdDataKernelArgs
 
     static constexpr index_t MaxGroupedGemmGroupsNum = 128;
 
-    using ABCGridDescs =
-        remove_cvref_t<decltype(ConvToGemmTransformer{}
-                                    .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(1))>;
+    using ABCGridDescs = remove_cvref_t<
+        decltype(ConvToGemmTransformer{}.MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(1))>;
 
     using AGridDescMK = remove_cvref_t<decltype(ABCGridDescs{}[number<0>{}])>;
     using BGridDescNK = remove_cvref_t<decltype(ABCGridDescs{}[number<1>{}])>;
@@ -469,6 +471,10 @@ template <typename GroupedConvTraitsType_,
           typename EpiloguePipeline_>
 struct GroupedConvolutionBackwardDataKernel
 {
+    // Todo: Enable Vector Load Size > 1
+    static_assert(GroupedConvTraitsType_::VectorSizeA == 1 &&
+                  GroupedConvTraitsType_::VectorSizeB == 1);
+
     static constexpr index_t NDimSpatial = GroupedConvTraitsType_::NDimSpatial_;
     static constexpr ConvolutionSpecialization ConvSpecialization =
         GroupedConvTraitsType_::ConvSpecialization;
@@ -510,10 +516,13 @@ struct GroupedConvolutionBackwardDataKernel
 
     static_assert(GemmPipeline::kPadM && GemmPipeline::kPadN && GemmPipeline::kPadK,
                   "Not supported!");
-    static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>,
-                  "Not supported A GEMM layout!");
-    static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>,
-                  "Not supported B GEMM layout!");
+    static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>, "Not supported!");
+    static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>, "Not supported!");
+    // TODO: Change to and enable vector load
+    // static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>,
+    //               "Not supported A GEMM layout!");
+    // static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::RowMajor>,
+    //               "Not supported B GEMM layout!");
     static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>,
                   "Not supported C GEMM layout!");
 
@@ -530,7 +539,10 @@ struct GroupedConvolutionBackwardDataKernel
         return dim3(kargs.grid_size_, kargs.GemmBatch, kargs.k_batch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize);
+    }
 
     CK_TILE_HOST static constexpr GroupedConvBwdDataKernelArgsSpecialized
     MakeKernelArgs(const GroupedConvBwdDataHostArgs& hostArgs)
@@ -546,7 +558,7 @@ struct GroupedConvolutionBackwardDataKernel
     CK_TILE_HOST static bool
     IsSupportedArgument(const GroupedConvBwdDataKernelArgsSpecialized& kargs)
     {
-        if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+        if constexpr((GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                       is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
                      !IsSplitKSupported)
         {
@@ -623,7 +635,7 @@ struct GroupedConvolutionBackwardDataKernel
                      std::is_same_v<InLayout, ctc::NDHWGC>)
         {
             // Check access per C
-            if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!");
                 return false;
@@ -635,13 +647,12 @@ struct GroupedConvolutionBackwardDataKernel
             return false;
         }
 
-        // check vector access of B
         // FIXME: layout
         if constexpr(std::is_same_v<WeiLayout, ctc::GKXC> ||
                      std::is_same_v<WeiLayout, ctc::GKYXC> ||
                      std::is_same_v<WeiLayout, ctc::GKZYXC>)
         {
-            if(ConvC % EpiloguePipeline::GetVectorSizeC() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeC != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
                 return false;
@@ -653,12 +664,11 @@ struct GroupedConvolutionBackwardDataKernel
             return false;
         }
 
-        // check vector access of E
         if constexpr(std::is_same_v<OutLayout, ctc::NWGK> ||
                      std::is_same_v<OutLayout, ctc::NHWGK> ||
                      std::is_same_v<OutLayout, ctc::NDHWGK>)
         {
-            if(ConvK % GemmPipeline::GetVectorSizeA() != 0)
+            if(ConvK % GroupedConvTraitsType_::VectorSizeA != 0)
             {
                 CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!");
                 return false;
@@ -830,7 +840,7 @@ struct GroupedConvolutionBackwardDataKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(
             gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1)));
 
         // Run GEMM cooperatively by whole workgroup.
@@ -881,7 +891,7 @@ struct GroupedConvolutionBackwardDataKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+        const index_t num_loop = amd_wave_read_first_lane(
             TilePartitioner::GetLoopNum(gemm_tile_windows.at(I0).get_length(I1)));
 
         // Run GEMM cooperatively by whole workgroup.
@@ -926,7 +936,7 @@ struct GroupedConvolutionBackwardDataKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX    = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX    = amd_wave_read_first_lane(blockIdx.x);
         const index_t group_id = FindGroupId(kargs, blockIdX);
 
         const auto [iM, iN] = OffsettedTile1DPartitioner<TilePartitioner>::GetOffsetedTileIndex(
@@ -934,13 +944,13 @@ struct GroupedConvolutionBackwardDataKernel
             kargs.c_grid_descs_m_n[group_id].get_length(I0),
             kargs.c_grid_descs_m_n[group_id].get_length(I1));
 
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // options
         // conv_bwd_data = Out * Weight = In
@@ -955,7 +965,7 @@ struct GroupedConvolutionBackwardDataKernel
         {
             __shared__ char smem_ptr_1[GetSmemSize()];
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                            is_any_of<OutDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm2LDS(a_ptr,
@@ -973,7 +983,7 @@ struct GroupedConvolutionBackwardDataKernel
         else
         {
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                            is_any_of<OutDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm(a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, kargs, i_m, i_n, group_id);
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 2700353049..b85660aea3 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -23,7 +23,10 @@ struct GroupedConvBwdWeightKernelArgs
 
     using ConvToGemmTransformer =
         TransformConvBwdWeightToGemm<GroupedConvTraitsType_::NDimSpatial,
-                                     GroupedConvTraitsType_::ConvSpecialization>;
+                                     GroupedConvTraitsType_::ConvSpecialization,
+                                     GroupedConvTraitsType_::VectorSizeA,
+                                     GroupedConvTraitsType_::VectorSizeB,
+                                     GroupedConvTraitsType_::VectorSizeC>;
     static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     template <
@@ -254,9 +257,8 @@ struct GroupedConvBwdWeightKernelArgs
         GemmBatch = args.G_;
     }
 
-    using ABCGridDescs =
-        remove_cvref_t<decltype(ConvToGemmTransformer{}
-                                    .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N())>;
+    using ABCGridDescs = remove_cvref_t<
+        decltype(ConvToGemmTransformer{}.MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N())>;
 
     using AGridDescMK = remove_cvref_t<decltype(ABCGridDescs{}[number<0>{}])>;
     using BGridDescNK = remove_cvref_t<decltype(ABCGridDescs{}[number<1>{}])>;
@@ -336,6 +338,10 @@ template <typename GroupedConvTraitsType_,
           typename EpiloguePipeline_>
 struct GroupedConvolutionBackwardWeightKernel
 {
+    // Todo: Enable Vector Load Size > 1
+    static_assert(GroupedConvTraitsType_::VectorSizeA == 1 &&
+                  GroupedConvTraitsType_::VectorSizeB == 1);
+
     static constexpr index_t NDimSpatial = GroupedConvTraitsType_::NDimSpatial_;
     static constexpr ConvolutionSpecialization ConvSpecialization =
         GroupedConvTraitsType_::ConvSpecialization;
@@ -356,11 +362,10 @@ struct GroupedConvolutionBackwardWeightKernel
 
     static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
-    using InDataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using OutDataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using InDataType  = remove_cvref_t<typename GemmPipeline::BDataType>;
     using DsDataType  = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
-    // Below type is actually accumulation data type - the output of block GEMM.
-    using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using WeiDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
     using GroupedConvBwdWeightKernelArgsSpecialized =
         GroupedConvBwdWeightKernelArgs<GroupedConvTraitsType_>;
@@ -377,6 +382,10 @@ struct GroupedConvolutionBackwardWeightKernel
                   "Not supported!");
     static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>, "Not supported!");
     static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>, "Not supported!");
+    // TODO: Change to and enable vector load
+    // static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::ColumnMajor>, "Not
+    // supported!"); static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::RowMajor>, "Not
+    // supported!");
     static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>, "Not supported!");
 
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
@@ -393,7 +402,10 @@ struct GroupedConvolutionBackwardWeightKernel
             TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN), kargs.GemmBatch, kargs.k_batch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize);
+    }
 
     CK_TILE_HOST static constexpr GroupedConvBwdWeightKernelArgsSpecialized
     MakeKernelArgs(const GroupedConvBwdWeightHostArgs& hostArgs)
@@ -411,22 +423,20 @@ struct GroupedConvolutionBackwardWeightKernel
         __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs,
                                      const std::size_t k_id = blockIdx.z)
         {
-            constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead =
-                __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1);
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
+            const index_t KRead = amd_wave_read_first_lane((kargs.GemmK + K_t - 1) / K_t * K1);
 
-            a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
+            a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
 
             if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
             {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
+                splitted_k = amd_wave_read_first_lane(KRead);
             }
             else
             {
-                splitted_k =
-                    __builtin_amdgcn_readfirstlane(kargs.GemmK - KRead * (kargs.k_batch - 1));
+                splitted_k = amd_wave_read_first_lane(kargs.GemmK - KRead * (kargs.k_batch - 1));
             }
         }
 
@@ -451,8 +461,8 @@ struct GroupedConvolutionBackwardWeightKernel
     CK_TILE_HOST static bool
     IsSupportedArgument(const GroupedConvBwdWeightKernelArgsSpecialized& kargs)
     {
-        if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
-                      is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
+        if constexpr((GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
+                      is_any_of<WeiDataType, fp16_t, bf16_t>::value) ||
                      !IsSplitKSupported)
         {
             if(kargs.k_batch != 1)
@@ -523,7 +533,7 @@ struct GroupedConvolutionBackwardWeightKernel
                      std::is_same_v<InLayout, ctc::NDHWGC>)
         {
             // Check access per C
-            if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!");
                 return false;
@@ -535,13 +545,11 @@ struct GroupedConvolutionBackwardWeightKernel
             return false;
         }
 
-        // check vector access of B
-        // FIXME: layout
         if constexpr(std::is_same_v<WeiLayout, ctc::GKXC> ||
                      std::is_same_v<WeiLayout, ctc::GKYXC> ||
                      std::is_same_v<WeiLayout, ctc::GKZYXC>)
         {
-            if(ConvC % EpiloguePipeline::GetVectorSizeC() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeC != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
                 return false;
@@ -553,12 +561,11 @@ struct GroupedConvolutionBackwardWeightKernel
             return false;
         }
 
-        // check vector access of E
         if constexpr(std::is_same_v<OutLayout, ctc::NWGK> ||
                      std::is_same_v<OutLayout, ctc::NHWGK> ||
                      std::is_same_v<OutLayout, ctc::NDHWGK>)
         {
-            if(ConvK % GemmPipeline::GetVectorSizeA() != 0)
+            if(ConvK % GroupedConvTraitsType_::VectorSizeA != 0)
             {
                 CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!");
                 return false;
@@ -594,9 +601,8 @@ struct GroupedConvolutionBackwardWeightKernel
         }();
 
         const auto& c_tensor_view = [&]() {
-            return make_tensor_view<address_space_enum::global, DstInMemOp>(
-                c_ptr,
-                kargs.c_grid_desc_m_n); // B: in
+            return make_tensor_view<address_space_enum::global, DstInMemOp>(c_ptr,
+                                                                            kargs.c_grid_desc_m_n);
         }();
 
         const auto& ds_tensor_view = generate_tuple(
@@ -605,11 +611,11 @@ struct GroupedConvolutionBackwardWeightKernel
                               "Not supported!");
                 static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>,
                               "Not supported!");
-                static_assert(std::is_same_v<std::tuple_element_t<i, DsDataType>, OutDataType>,
+                static_assert(std::is_same_v<std::tuple_element_t<i, DsDataType>, WeiDataType>,
                               "Not supported!");
 
                 return make_tensor_view<address_space_enum::global>(
-                    static_cast<OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
+                    static_cast<WeiDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
             },
             number<NumDTensor>{});
 
@@ -797,22 +803,22 @@ struct GroupedConvolutionBackwardWeightKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdZ    = __builtin_amdgcn_readfirstlane(blockIdx.z);
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+        const auto blockIdZ    = amd_wave_read_first_lane(blockIdx.z);
+        const index_t num_loop = amd_wave_read_first_lane(
             ck_tile::integer_divide_ceil(kargs.GemmK, kargs.k_batch * TilePartitioner::KPerBlock));
         const index_t i_k =
-            __builtin_amdgcn_readfirstlane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
+            amd_wave_read_first_lane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // options
         // conv_bwd_weight = Out * In = Weight
@@ -827,8 +833,8 @@ struct GroupedConvolutionBackwardWeightKernel
         {
             __shared__ char smem_ptr_1[GetSmemSize()];
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
-                           is_any_of<OutDataType, fp16_t, bf16_t>::value))
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
+                           is_any_of<WeiDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm2LDS(a_ptr,
                             b_ptr,
@@ -846,8 +852,8 @@ struct GroupedConvolutionBackwardWeightKernel
         else
         {
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
-                           is_any_of<OutDataType, fp16_t, bf16_t>::value))
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
+                           is_any_of<WeiDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm(
                     a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, kargs, num_loop, i_m, i_n, i_k);
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index d4f4eca0d0..0363782d33 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -23,7 +23,11 @@ struct GroupedConvFwdKernelArgs
 
     using ConvToGemmFwdTransformer =
         TransformConvFwdToGemm<GroupedConvTraitsType_::NDimSpatial,
-                               GroupedConvTraitsType_::ConvSpecialization>;
+                               GroupedConvTraitsType_::ConvSpecialization,
+                               GroupedConvTraitsType_::VectorSizeA,
+                               GroupedConvTraitsType_::VectorSizeB,
+                               GroupedConvTraitsType_::VectorSizeC,
+                               true>; // Split N enabled
     static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     template <
@@ -56,7 +60,7 @@ struct GroupedConvFwdKernelArgs
 
         k_batch = args.k_batch;
 
-        GemmM     = args.N_ * args.output_spatial_lengths_[0];
+        // GemmM will be set after Split-N calculation
         GemmN     = args.K_;
         GemmK     = args.C_ * args.filter_spatial_lengths_[0];
         GemmBatch = args.G_;
@@ -94,6 +98,19 @@ struct GroupedConvFwdKernelArgs
                                          1,
                                          std::multiplies<index_t>());
         group_stride_c = args.K_;
+
+        // Initialize Split-N support fields for 1D convolution (NWGC layout)
+        // Get the actual split N from transformer
+        n_per_split = conv_to_gemm_transformer.GetN();
+        original_n  = conv_to_gemm_transformer.GetOriginalN();
+        n_splits    = ck_tile::integer_divide_ceil(original_n, n_per_split);
+
+        // Calculate batch strides for NWGC layout
+        input_batch_stride  = args.C_ * args.input_spatial_lengths_[0];
+        output_batch_stride = args.K_ * args.output_spatial_lengths_[0];
+
+        // Update GemmM to use split N (not original N)
+        GemmM = n_per_split * args.output_spatial_lengths_[0];
     }
 
     template <
@@ -133,7 +150,7 @@ struct GroupedConvFwdKernelArgs
 
         k_batch = args.k_batch;
 
-        GemmM     = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1];
+        // Note: GemmM will be set after Split-N calculation
         GemmN     = args.K_;
         GemmK     = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1];
         GemmBatch = args.G_;
@@ -171,6 +188,21 @@ struct GroupedConvFwdKernelArgs
                                          1,
                                          std::multiplies<index_t>());
         group_stride_c = args.K_;
+
+        // Initialize Split-N support fields for 2D convolution (NHWGC layout)
+        // Get the actual split N from transformer
+        n_per_split = conv_to_gemm_transformer.GetN();
+        original_n  = conv_to_gemm_transformer.GetOriginalN();
+        n_splits    = ck_tile::integer_divide_ceil(original_n, n_per_split);
+
+        // Calculate batch strides for NHWGC layout
+        input_batch_stride =
+            args.C_ * args.input_spatial_lengths_[0] * args.input_spatial_lengths_[1];
+        output_batch_stride =
+            args.K_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1];
+
+        // Update GemmM to use split N (not original N)
+        GemmM = n_per_split * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1];
     }
 
     template <
@@ -217,8 +249,7 @@ struct GroupedConvFwdKernelArgs
 
         k_batch = args.k_batch;
 
-        GemmM = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1] *
-                args.output_spatial_lengths_[2];
+        // Note: GemmM will be set after Split-N calculation
         GemmN = args.K_;
         GemmK = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1] *
                 args.filter_spatial_lengths_[2];
@@ -257,6 +288,22 @@ struct GroupedConvFwdKernelArgs
                                          1,
                                          std::multiplies<index_t>());
         group_stride_c = args.K_;
+
+        // Initialize Split-N support fields for 3D convolution (NDHWGC layout)
+        // Get the actual split N from transformer
+        n_per_split = conv_to_gemm_transformer.GetN();
+        original_n  = conv_to_gemm_transformer.GetOriginalN();
+        n_splits    = ck_tile::integer_divide_ceil(original_n, n_per_split);
+
+        // Calculate batch strides for NDHWGC layout
+        input_batch_stride = args.C_ * args.input_spatial_lengths_[0] *
+                             args.input_spatial_lengths_[1] * args.input_spatial_lengths_[2];
+        output_batch_stride = args.K_ * args.output_spatial_lengths_[0] *
+                              args.output_spatial_lengths_[1] * args.output_spatial_lengths_[2];
+
+        // Update GemmM to use split N (not original N)
+        GemmM = n_per_split * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1] *
+                args.output_spatial_lengths_[2];
     }
 
     using AGridDescMK = remove_cvref_t<
@@ -297,6 +344,13 @@ struct GroupedConvFwdKernelArgs
     long_index_t group_stride_a;
     long_index_t group_stride_b;
     long_index_t group_stride_c;
+
+    // Split-N support fields - initialize to safe defaults
+    index_t n_splits            = 1; // Number of batch splits (e.g., 2 for 128→64×2)
+    index_t n_per_split         = 1; // Batches per split (N_ from transformer)
+    index_t original_n          = 1; // Original batch size before splitting
+    index_t input_batch_stride  = 0; // Stride to next batch in input tensor
+    index_t output_batch_stride = 0; // Stride to next batch in output tensor
 };
 
 /// @brief The Grouped Convolution Forward kernel template.
@@ -392,13 +446,16 @@ struct GroupedConvolutionForwardKernel
         // clang-format on
     }
 
-    CK_TILE_HOST static constexpr auto GridSize(const GroupedConvFwdKernelArgsSpecialized& kargs)
+    CK_TILE_HOST static auto GridSize(const GroupedConvFwdKernelArgsSpecialized& kargs)
     {
         return dim3(
-            TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN), kargs.GemmBatch, kargs.k_batch);
+            TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN), kargs.GemmBatch, kargs.n_splits);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize);
+    }
 
     CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
     MakeKernelArgs(const GroupedConvFwdHostArgs& hostArgs)
@@ -413,7 +470,7 @@ struct GroupedConvolutionForwardKernel
 
     CK_TILE_HOST static bool IsSupportedArgument(const GroupedConvFwdKernelArgsSpecialized& kargs)
     {
-        if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+        if constexpr((GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                       is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
                      !IsSplitKSupported)
         {
@@ -427,6 +484,17 @@ struct GroupedConvolutionForwardKernel
             }
         }
 
+        // Check Split-K and Split-N conflict (both use blockIdx.z)
+        if(kargs.k_batch > 1 && kargs.n_splits > 1)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR(
+                    "Cannot use both Split-K and Split-N simultaneously (both use blockIdx.z)!");
+            }
+            return false;
+        }
+
         const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];
         const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];
 
@@ -485,7 +553,7 @@ struct GroupedConvolutionForwardKernel
                      std::is_same_v<InLayout, ctc::NDHWGC>)
         {
             // Check access per C
-            if(ConvC % GemmPipeline::GetVectorSizeA() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeA != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!");
                 return false;
@@ -503,7 +571,7 @@ struct GroupedConvolutionForwardKernel
                      std::is_same_v<WeiLayout, ctc::GKYXC> ||
                      std::is_same_v<WeiLayout, ctc::GKZYXC>)
         {
-            if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
+            if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0)
             {
                 CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
                 return false;
@@ -520,7 +588,7 @@ struct GroupedConvolutionForwardKernel
                      std::is_same_v<OutLayout, ctc::NHWGK> ||
                      std::is_same_v<OutLayout, ctc::NDHWGK>)
         {
-            if(ConvK % EpiloguePipeline::GetVectorSizeC() != 0)
+            if(ConvK % GroupedConvTraitsType_::VectorSizeC != 0)
             {
                 CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!");
                 return false;
@@ -684,8 +752,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -734,8 +801,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -754,21 +820,37 @@ struct GroupedConvolutionForwardKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
-        // options
-        const InDataType* a_ptr  = static_cast<const InDataType*>(kargs.in_ptr) + group_offset_a;
-        const WeiDataType* b_ptr = static_cast<const WeiDataType*>(kargs.wei_ptr) + group_offset_b;
-        OutDataType* c_ptr       = static_cast<OutDataType*>(kargs.out_ptr) + group_offset_c;
+        // Split-N handling: Get which split this workgroup handles
+        const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
+
+        // Calculate batch offset for this split
+        const index_t batch_offset = amd_wave_read_first_lane(blockIdZ * kargs.n_per_split);
+
+        // Calculate memory offsets for this split
+        const long_index_t input_batch_offset = static_cast<long_index_t>(batch_offset) *
+                                                static_cast<long_index_t>(kargs.input_batch_stride);
+        const long_index_t output_batch_offset =
+            static_cast<long_index_t>(batch_offset) *
+            static_cast<long_index_t>(kargs.output_batch_stride);
+
+        // Adjust pointers: combine group offset and batch offset
+        const InDataType* a_ptr =
+            static_cast<const InDataType*>(kargs.in_ptr) + group_offset_a + input_batch_offset;
+        const WeiDataType* b_ptr = static_cast<const WeiDataType*>(kargs.wei_ptr) +
+                                   group_offset_b; // No batch offset for weights!
+        OutDataType* c_ptr =
+            static_cast<OutDataType*>(kargs.out_ptr) + group_offset_c + output_batch_offset;
 
         // allocate LDS
         __shared__ char smem_ptr_0[GetSmemSize()];
@@ -777,7 +859,7 @@ struct GroupedConvolutionForwardKernel
         {
             __shared__ char smem_ptr_1[GetSmemSize()];
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                            is_any_of<OutDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm2LDS(
@@ -787,7 +869,7 @@ struct GroupedConvolutionForwardKernel
         else
         {
             if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
                            is_any_of<OutDataType, fp16_t, bf16_t>::value))
             {
                 RunGemm(a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, kargs, i_m, i_n);
diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
index 3e5e87a975..c745aee622 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
@@ -49,7 +49,10 @@ template <index_t NDimSpatial_,
           typename InLayout_,
           typename WeiLayout_,
           typename DsLayout_,
-          typename OutLayout_>
+          typename OutLayout_,
+          index_t VectorSizeA_ = 1,
+          index_t VectorSizeB_ = 1,
+          index_t VectorSizeC_ = 1>
 struct GroupedConvTraits
 {
     private:
@@ -67,14 +70,38 @@ struct GroupedConvTraits
     using WeiLayout                                               = WeiLayout_;
     using DsLayout                                                = DsLayout_;
     using OutLayout                                               = OutLayout_;
-    using GroupedConvImplicitGemmTraits                           = TileGemmTraits<true,
-                                                                                   true,
-                                                                                   true,
-                                                                                   ck_tile::tensor_layout::gemm::RowMajor,
-                                                                                   ck_tile::tensor_layout::gemm::ColumnMajor,
-                                                                                   ck_tile::tensor_layout::gemm::RowMajor>;
-    static constexpr index_t NumDTensor                           = DsLayout::size();
-    using ImplicitGemmDsLayout = decltype(generate_implicit_gemm_layout());
+    using GroupedConvImplicitGemmTraitsFwd =
+        TileGemmTraits<true,
+                       true,
+                       true,
+                       ck_tile::tensor_layout::gemm::RowMajor,
+                       ck_tile::tensor_layout::gemm::ColumnMajor,
+                       ck_tile::tensor_layout::gemm::RowMajor>;
+    using GroupedConvImplicitGemmTraitsBwdData =
+        TileGemmTraits<true,
+                       true,
+                       true,
+                       ck_tile::tensor_layout::gemm::RowMajor,
+                       ck_tile::tensor_layout::gemm::ColumnMajor,
+                       // TODO: Change to and enable vector load
+                       //    ck_tile::tensor_layout::gemm::RowMajor,
+                       //    ck_tile::tensor_layout::gemm::RowMajor,
+                       ck_tile::tensor_layout::gemm::RowMajor>;
+    using GroupedConvImplicitGemmTraitsBwdWeight =
+        TileGemmTraits<true,
+                       true,
+                       true,
+                       ck_tile::tensor_layout::gemm::RowMajor,
+                       ck_tile::tensor_layout::gemm::ColumnMajor,
+                       // TODO: Change to and enable vector load
+                       //    ck_tile::tensor_layout::gemm::ColumnMajor,
+                       //    ck_tile::tensor_layout::gemm::RowMajor,
+                       ck_tile::tensor_layout::gemm::RowMajor>;
+    static constexpr ck_tile::index_t VectorSizeA = VectorSizeA_;
+    static constexpr ck_tile::index_t VectorSizeB = VectorSizeB_;
+    static constexpr ck_tile::index_t VectorSizeC = VectorSizeC_;
+    static constexpr index_t NumDTensor           = DsLayout::size();
+    using ImplicitGemmDsLayout                    = decltype(generate_implicit_gemm_layout());
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp
index 972d05ff3e..c68a0a1400 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp
@@ -10,6 +10,9 @@ namespace ck_tile {
 
 template <index_t NDimSpatial,
           ConvolutionSpecialization ConvolutionSpecialization,
+          index_t VectorSizeA,
+          index_t VectorSizeB,
+          index_t VectorSizeC,
           bool SplitN              = false,
           typename ADataType       = float,
           typename CDataType       = float,
@@ -442,14 +445,17 @@ struct TransformConvBwdDataToGemm
         // TODO Add support for NumGroupsToMerge > 1
 
         return make_naive_tensor_descriptor(make_tuple(N_, Wo_, K_),
-                                            make_tuple(NStride, WoStride, KStride));
+                                            make_tuple(NStride, WoStride, KStride),
+                                            number<VectorSizeA>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
     CK_TILE_HOST auto make_wei_grid_desc() const
     {
         // GKXC
-        return make_naive_tensor_descriptor_packed(make_tuple(K_, X_, C_));
+        return make_naive_tensor_descriptor(
+            make_tuple(K_, X_, C_), make_tuple(X_ * C_, C_, I1), number<VectorSizeB>{}, I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
@@ -462,7 +468,9 @@ struct TransformConvBwdDataToGemm
 
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(make_tuple(N_, Wi_, C_),
-                                            make_tuple(NStride, WiStride, CStride));
+                                            make_tuple(NStride, WiStride, CStride),
+                                            number<VectorSizeC>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
@@ -477,7 +485,9 @@ struct TransformConvBwdDataToGemm
         // TODO Add support for NumGroupsToMerge > 1
 
         return make_naive_tensor_descriptor(make_tuple(N_, Ho_, Wo_, K_),
-                                            make_tuple(NStride, HoStride, WoStride, KStride));
+                                            make_tuple(NStride, HoStride, WoStride, KStride),
+                                            number<VectorSizeA>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
@@ -491,14 +501,19 @@ struct TransformConvBwdDataToGemm
 
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(make_tuple(N_, Hi_, Wi_, C_),
-                                            make_tuple(NStride, HiStride, WiStride, CStride));
+                                            make_tuple(NStride, HiStride, WiStride, CStride),
+                                            number<VectorSizeB>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
     CK_TILE_HOST auto make_wei_grid_desc() const
     {
         // GKYXC
-        return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_));
+        return make_naive_tensor_descriptor(make_tuple(K_, Y_, X_, C_),
+                                            make_tuple(C_ * X_ * Y_, C_ * X_, C_, I1),
+                                            number<VectorSizeC>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
@@ -514,7 +529,9 @@ struct TransformConvBwdDataToGemm
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(
             make_tuple(N_, Do_, Ho_, Wo_, K_),
-            make_tuple(NStride, DoStride, HoStride, WoStride, KStride));
+            make_tuple(NStride, DoStride, HoStride, WoStride, KStride),
+            number<VectorSizeA>{},
+            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
@@ -529,14 +546,20 @@ struct TransformConvBwdDataToGemm
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(
             make_tuple(N_, Di_, Hi_, Wi_, C_),
-            make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+            make_tuple(NStride, DiStride, HiStride, WiStride, CStride),
+            number<VectorSizeB>{},
+            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
     CK_TILE_HOST auto make_wei_grid_desc() const
     {
         // GKZYXC
-        return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_));
+        return make_naive_tensor_descriptor(
+            make_tuple(K_, Z_, Y_, X_, C_),
+            make_tuple(C_ * X_ * Y_ * Z_, C_ * X_ * Y_, C_ * X_, C_, I1),
+            number<VectorSizeC>{},
+            I1);
     }
     // TODO: implement ck_tile::tensor_layout::convolution that describe packed/strided dimemsion as
     // properties
diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp
index b2b7918810..010a8ac949 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp
@@ -10,6 +10,9 @@ namespace ck_tile {
 
 template <index_t NDimSpatial,
           ConvolutionSpecialization ConvolutionSpecialization,
+          index_t VectorSizeA,
+          index_t VectorSizeB,
+          index_t VectorSizeC,
           bool SplitN              = false,
           typename ADataType       = float,
           typename CDataType       = float,
@@ -420,7 +423,9 @@ struct TransformConvBwdWeightToGemm
         // TODO Add support for NumGroupsToMerge > 1
 
         return make_naive_tensor_descriptor(make_tuple(K_, N_ * Wo_),
-                                            make_tuple(KStride, NDoHoWoStride));
+                                            make_tuple(KStride, NDoHoWoStride),
+                                            number<VectorSizeA>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
@@ -433,7 +438,9 @@ struct TransformConvBwdWeightToGemm
 
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(make_tuple(N_, Wi_, C_),
-                                            make_tuple(NStride, WiStride, CStride));
+                                            make_tuple(NStride, WiStride, CStride),
+                                            number<VectorSizeB>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
@@ -444,7 +451,8 @@ struct TransformConvBwdWeightToGemm
         constexpr auto CXStride = I1;
 
         // TODO Add support for NumGroupsToMerge > 1
-        return make_naive_tensor_descriptor(make_tuple(K_, X_ * C_), make_tuple(KStride, CXStride));
+        return make_naive_tensor_descriptor(
+            make_tuple(K_, X_ * C_), make_tuple(KStride, CXStride), number<VectorSizeC>{}, I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
@@ -457,7 +465,9 @@ struct TransformConvBwdWeightToGemm
         // TODO Add support for NumGroupsToMerge > 1
 
         return make_naive_tensor_descriptor(make_tuple(K_, N_ * Ho_ * Wo_),
-                                            make_tuple(KStride, NDoHoWoStride));
+                                            make_tuple(KStride, NDoHoWoStride),
+                                            number<VectorSizeA>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
@@ -471,7 +481,9 @@ struct TransformConvBwdWeightToGemm
 
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(make_tuple(N_, Hi_, Wi_, C_),
-                                            make_tuple(NStride, HiStride, WiStride, CStride));
+                                            make_tuple(NStride, HiStride, WiStride, CStride),
+                                            number<VectorSizeB>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
@@ -482,8 +494,8 @@ struct TransformConvBwdWeightToGemm
         constexpr auto CStride = I1;
 
         // TODO Add support for NumGroupsToMerge > 1
-        return make_naive_tensor_descriptor(make_tuple(K_, Y_ * X_ * C_),
-                                            make_tuple(KStride, CStride));
+        return make_naive_tensor_descriptor(
+            make_tuple(K_, Y_ * X_ * C_), make_tuple(KStride, CStride), number<VectorSizeC>{}, I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
@@ -496,7 +508,9 @@ struct TransformConvBwdWeightToGemm
         // TODO Add support for NumGroupsToMerge > 1
 
         return make_naive_tensor_descriptor(make_tuple(K_, N_ * Do_ * Ho_ * Wo_),
-                                            make_tuple(KStride, NDoHoWoStride));
+                                            make_tuple(KStride, NDoHoWoStride),
+                                            number<VectorSizeA>{},
+                                            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
@@ -511,7 +525,9 @@ struct TransformConvBwdWeightToGemm
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(
             make_tuple(N_, Di_, Hi_, Wi_, C_),
-            make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+            make_tuple(NStride, DiStride, HiStride, WiStride, CStride),
+            number<VectorSizeB>{},
+            I1);
     }
 
     template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
@@ -523,7 +539,9 @@ struct TransformConvBwdWeightToGemm
 
         // TODO Add support for NumGroupsToMerge > 1
         return make_naive_tensor_descriptor(make_tuple(K_, Z_ * Y_ * X_ * C_),
-                                            make_tuple(KStride, CStride));
+                                            make_tuple(KStride, CStride),
+                                            number<VectorSizeC>{},
+                                            I1);
     }
 
     // TODO: implement ck_tile::tensor_layout::convolution that describe packed/strided dimemsion as
diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
index c468ae4398..9e84973a89 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
@@ -10,6 +10,9 @@ namespace ck_tile {
 
 template <index_t NDimSpatial,
           ConvolutionSpecialization ConvSpecialization,
+          index_t VectorSizeA,
+          index_t VectorSizeB,
+          index_t VectorSizeC,
           bool SplitN              = false,
           typename ADataType       = float,
           typename CDataType       = float,
@@ -24,7 +27,7 @@ struct TransformConvFwdToGemm
     static constexpr auto I3 = number<3>{};
     static constexpr auto I4 = number<4>{};
     static constexpr auto I5 = number<5>{};
-#if 0 // TODO: Enable these functionalities
+
     template <typename ConvDimsType>
     static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths,
                                                           const ConvDimsType& strides,
@@ -42,24 +45,40 @@ struct TransformConvFwdToGemm
 
     template <typename ConvDimsType>
     static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_c_wis_lengths,
-                                     const ConvDimsType& a_g_n_c_wis_strides,
-                                     const ConvDimsType& c_g_n_k_wos_lengths,
-                                     const ConvDimsType& c_g_n_k_wos_strides)
+                                     const ConvDimsType& c_g_n_k_wos_lengths)
     {
+        // Calculate strides internally assuming contiguous memory layout
+        ConvDimsType a_g_n_c_wis_strides, c_g_n_k_wos_strides;
+        const index_t num_dims = a_g_n_c_wis_lengths.size();
+
+        // Calculate strides for input tensor (innermost to outermost)
+        a_g_n_c_wis_strides[num_dims - 1] = 1;
+        for(index_t i = num_dims - 2; i >= 0; i--)
+        {
+            a_g_n_c_wis_strides[i] = a_g_n_c_wis_strides[i + 1] * a_g_n_c_wis_lengths[i + 1];
+        }
+
+        // Calculate strides for output tensor
+        c_g_n_k_wos_strides[num_dims - 1] = 1;
+        for(index_t i = num_dims - 2; i >= 0; i--)
+        {
+            c_g_n_k_wos_strides[i] = c_g_n_k_wos_strides[i + 1] * c_g_n_k_wos_lengths[i + 1];
+        }
+
         const long_index_t a_element_space_size =
             calculate_element_space_size_impl(a_g_n_c_wis_lengths, a_g_n_c_wis_strides, I1);
         const long_index_t c_element_space_size =
             calculate_element_space_size_impl(c_g_n_k_wos_lengths, c_g_n_k_wos_strides, I1);
-        const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType),
-                                                          c_element_space_size * sizeof(CDataType));
-        constexpr long_index_t TwoGB          = (long_index_t{1} << 31);
+        const long_index_t element_space_size = ck_tile::max(
+            a_element_space_size * sizeof(ADataType), c_element_space_size * sizeof(CDataType));
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31); // 2GB
 
         const IndexType N = a_g_n_c_wis_lengths[I1];
 
         if(element_space_size > TwoGB)
         {
             // Minimum divisor of N to not exceed 2GB
-            const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB);
+            const auto divisor = ck_tile::integer_divide_ceil(element_space_size, TwoGB);
 
             if(divisor <= static_cast<double>(N))
             {
@@ -70,7 +89,8 @@ struct TransformConvFwdToGemm
                 {
                     if(N % least_divisor == 0)
                     {
-                        return N / least_divisor;
+                        IndexType result = N / least_divisor;
+                        return result;
                     }
                 }
                 // Not found, process one Convolution N per block
@@ -90,9 +110,12 @@ struct TransformConvFwdToGemm
             return N;
         }
     }
-#endif
 
     public:
+    // Public getter methods for Split-N support
+    CK_TILE_HOST constexpr IndexType GetN() const { return N_; }
+    CK_TILE_HOST constexpr IndexType GetOriginalN() const { return original_N_; }
+
     CK_TILE_HOST constexpr TransformConvFwdToGemm() {}
 
     template <typename TransformConvFwdToGemmBase>
@@ -100,6 +123,7 @@ struct TransformConvFwdToGemm
     TransformConvFwdToGemm(const TransformConvFwdToGemmBase& transform_conv_fwd_to_gemm_base)
         : G_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.G_)},
           N_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.N_)},
+          original_N_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.original_N_)},
           Di_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.Di_)},
           Hi_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.Hi_)},
           Wi_{static_cast<IndexType>(transform_conv_fwd_to_gemm_base.Wi_)},
@@ -168,18 +192,14 @@ struct TransformConvFwdToGemm
                       std::is_same_v<ConvSpatialDimsType, ck_tile::array<IndexType, NDimSpatial>>);
         static_assert(std::is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
                       std::is_same_v<ConvDimsType, ck_tile::array<IndexType, NDimSpatial + I3>>);
-#if 0 // TODO: Enable these functionalities
         if constexpr(SplitN)
         {
-            N_ = GetSplitedNSize(
-                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+            N_ = GetSplitedNSize(a_g_n_c_wis_lengths, c_g_n_k_wos_lengths);
         }
         else
         {
             N_ = c_g_n_k_wos_lengths[I1];
         }
-#endif
-        N_ = c_g_n_k_wos_lengths[I1];
     }
 
     template <typename ConvDimsType,
@@ -223,18 +243,19 @@ struct TransformConvFwdToGemm
                       std::is_same_v<ConvSpatialDimsType, ck_tile::array<IndexType, NDimSpatial>>);
         static_assert(std::is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
                       std::is_same_v<ConvDimsType, ck_tile::array<IndexType, NDimSpatial + I3>>);
-#if 0 // TODO: Enable these functionalities
+
+        // Store original N
+        original_N_ = c_g_n_k_wos_lengths[I1];
+
         if constexpr(SplitN)
         {
-            N_ = GetSplitedNSize(
-                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+            N_ = GetSplitedNSize(a_g_n_c_wis_lengths, c_g_n_k_wos_lengths);
         }
         else
         {
-            N_ = c_g_n_k_wos_lengths[I1];
+            N_          = c_g_n_k_wos_lengths[I1];
+            original_N_ = N_;
         }
-#endif
-        N_ = c_g_n_k_wos_lengths[I1];
     }
 
     template <typename ConvDimsType,
@@ -278,18 +299,18 @@ struct TransformConvFwdToGemm
                       std::is_same_v<ConvSpatialDimsType, ck_tile::array<IndexType, NDimSpatial>>);
         static_assert(std::is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
                       std::is_same_v<ConvDimsType, ck_tile::array<IndexType, NDimSpatial + I3>>);
-#if 0 // TODO: Enable these functionalities
+
+        // Store original N before potential splitting
+        original_N_ = c_g_n_k_wos_lengths[I1];
+
         if constexpr(SplitN)
         {
-            N_ = GetSplitedNSize(
-                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+            N_ = GetSplitedNSize(a_g_n_c_wis_lengths, c_g_n_k_wos_lengths);
         }
         else
         {
-            N_ = c_g_n_k_wos_lengths[I1];
+            N_ = original_N_;
         }
-#endif
-        N_ = c_g_n_k_wos_lengths[I1];
     }
 
 #if 0 // TODO: Enable these functionalities
@@ -428,7 +449,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wo_, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
                 return transform_tensor_descriptor(
                     in_gemmm_gemmk_desc,
                     make_tuple(make_merge_transform(make_tuple(N_, Wo_)),
@@ -440,7 +463,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_gemmm_groups_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wo_, NumGroupsToMerge, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 return transform_tensor_descriptor(
                     in_gemmm_groups_gemmk_desc,
@@ -455,8 +480,11 @@ struct TransformConvFwdToGemm
             if constexpr(NumGroupsToMerge == 1)
             {
 
-                const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
-                    make_tuple(N_, Wi_), make_tuple(NStrideTensorA_, WiStride_));
+                const auto in_n_wi_c_desc =
+                    make_naive_tensor_descriptor(make_tuple(N_, Wi_),
+                                                 make_tuple(NStrideTensorA_, WiStride_),
+                                                 number<VectorSizeA>{},
+                                                 I1);
 
                 const auto in_n_wip_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -484,7 +512,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wi_, NumGroupsToMerge),
-                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_wip_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -517,7 +547,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_wo_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -538,7 +570,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wi_, NumGroupsToMerge, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_wo_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -563,7 +597,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_wip_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -593,7 +629,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Wi_, NumGroupsToMerge, C_),
-                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_wip_c_desc = transform_tensor_descriptor(
                     in_n_wi_c_desc,
@@ -643,7 +681,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Ho_, Wo_, C_),
-                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 return transform_tensor_descriptor(
                     in_gemmm_gemmk_desc,
@@ -657,7 +697,9 @@ struct TransformConvFwdToGemm
                 const auto in_gemmm_groups_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Ho_, Wo_, NumGroupsToMerge, C_),
                     make_tuple(
-                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 return transform_tensor_descriptor(
                     in_gemmm_groups_gemmk_desc,
@@ -671,8 +713,11 @@ struct TransformConvFwdToGemm
         {
             if constexpr(NumGroupsToMerge == 1)
             {
-                const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
-                    make_tuple(N_, Hi_, Wi_), make_tuple(NStrideTensorA_, HiStride_, WiStride_));
+                const auto in_n_hi_wi_c_desc =
+                    make_naive_tensor_descriptor(make_tuple(N_, Hi_, Wi_),
+                                                 make_tuple(NStrideTensorA_, HiStride_, WiStride_),
+                                                 number<VectorSizeA>{},
+                                                 I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_c_desc,
@@ -703,7 +748,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_hi_wi_groups_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Hi_, Wi_, NumGroupsToMerge),
-                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_));
+                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_groups_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_groups_c_desc,
@@ -739,7 +786,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Hi_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_ho_wo_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_c_desc,
@@ -762,7 +811,9 @@ struct TransformConvFwdToGemm
                 const auto in_n_hi_wi_groups_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Hi_, Wi_, NumGroupsToMerge, C_),
                     make_tuple(
-                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_ho_wo_groups_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_groups_c_desc,
@@ -790,7 +841,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Hi_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_c_desc,
@@ -825,7 +878,9 @@ struct TransformConvFwdToGemm
                 const auto in_n_hi_wi_groups_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Hi_, Wi_, NumGroupsToMerge, C_),
                     make_tuple(
-                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_));
+                        NStrideTensorA_, HiStride_, WiStride_, GStrideTensorA_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_groups_c_desc = transform_tensor_descriptor(
                     in_n_hi_wi_groups_c_desc,
@@ -886,7 +941,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Do_, Ho_, Wo_, C_),
-                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 return transform_tensor_descriptor(
                     in_gemmm_gemmk_desc,
@@ -904,7 +961,9 @@ struct TransformConvFwdToGemm
                                HiStride_,
                                WiStride_,
                                GStrideTensorA_,
-                               CStrideTensorA_));
+                               CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 return transform_tensor_descriptor(
                     in_gemmm_groups_gemmk_desc,
@@ -921,7 +980,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Di_, Hi_, Wi_),
-                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_));
+                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -957,7 +1018,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Di_, Hi_, Wi_, NumGroupsToMerge),
-                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, GStrideTensorA_));
+                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, GStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -1004,7 +1067,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Di_, Hi_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -1034,7 +1099,9 @@ struct TransformConvFwdToGemm
                                HiStride_,
                                WiStride_,
                                GStrideTensorA_,
-                               CStrideTensorA_));
+                               CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -1072,7 +1139,9 @@ struct TransformConvFwdToGemm
             {
                 const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
                     make_tuple(N_, Di_, Hi_, Wi_, C_),
-                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_));
+                    make_tuple(NStrideTensorA_, DiStride_, HiStride_, WiStride_, CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -1120,7 +1189,9 @@ struct TransformConvFwdToGemm
                                HiStride_,
                                WiStride_,
                                GStrideTensorA_,
-                               CStrideTensorA_));
+                               CStrideTensorA_),
+                    number<VectorSizeA>{},
+                    I1);
 
                 const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
                     in_n_di_hi_wi_c_desc,
@@ -1199,14 +1270,19 @@ struct TransformConvFwdToGemm
 
             if constexpr(NumGroupsToMerge == 1)
             {
-                return make_naive_tensor_descriptor_packed(make_tuple(K_, FilterSizeNumType{}));
+                return make_naive_tensor_descriptor(make_tuple(K_, FilterSizeNumType{}),
+                                                    make_tuple(FilterSizeNumType{}, I1),
+                                                    number<VectorSizeB>{},
+                                                    I1);
             }
             else
             {
 
                 const auto wei_gemmn_groups_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(K_, NumGroupsToMerge, FilterSizeNumType{}),
-                    make_tuple(KStrideTensorB_, GStrideTensorB_, CStrideTensorB_));
+                    make_tuple(KStrideTensorB_, GStrideTensorB_, CStrideTensorB_),
+                    number<VectorSizeB>{},
+                    I1);
                 return transform_tensor_descriptor(
                     wei_gemmn_groups_gemmk_desc,
                     make_tuple(make_merge_transform(make_tuple(K_, NumGroupsToMerge)),
@@ -1219,13 +1295,18 @@ struct TransformConvFwdToGemm
         {
             if constexpr(NumGroupsToMerge == 1)
             {
-                return make_naive_tensor_descriptor_packed(make_tuple(K_, ZYX_ * C_));
+                return make_naive_tensor_descriptor(make_tuple(K_, ZYX_ * C_),
+                                                    make_tuple(ZYX_ * C_, I1),
+                                                    number<VectorSizeB>{},
+                                                    I1);
             }
             else
             {
                 const auto wei_gemmn_groups_gemmk_desc = make_naive_tensor_descriptor(
                     make_tuple(K_, NumGroupsToMerge, ZYX_ * C_),
-                    make_tuple(KStrideTensorB_, GStrideTensorB_, CStrideTensorB_));
+                    make_tuple(KStrideTensorB_, GStrideTensorB_, CStrideTensorB_),
+                    number<VectorSizeB>{},
+                    I1);
                 return transform_tensor_descriptor(
                     wei_gemmn_groups_gemmk_desc,
                     make_tuple(make_merge_transform(make_tuple(K_, NumGroupsToMerge)),
@@ -1252,14 +1333,18 @@ struct TransformConvFwdToGemm
         if constexpr(NumGroupsToMerge == 1)
         {
             return make_naive_tensor_descriptor(make_tuple(NDoHoWo, K_),
-                                                make_tuple(WoStride_, KStrideTensorC_));
+                                                make_tuple(WoStride_, KStrideTensorC_),
+                                                number<VectorSizeC>{},
+                                                I1);
         }
         else
         {
             const auto nhwo_groups_k_1_desc = make_naive_tensor_descriptor(
                 make_tuple(N_, Wo_, NumGroupsToMerge, K_, 1),
                 make_tuple(
-                    NStrideTensorC_, WoStride_, GStrideTensorC_, KStrideTensorC_, GStrideTensorC_));
+                    NStrideTensorC_, WoStride_, GStrideTensorC_, KStrideTensorC_, GStrideTensorC_),
+                number<VectorSizeC>{},
+                I1);
             // Padd 1 to NumGroupsToMerge
             const auto padded_desc = transform_tensor_descriptor(
                 nhwo_groups_k_1_desc,
@@ -1310,7 +1395,9 @@ struct TransformConvFwdToGemm
         if constexpr(NumGroupsToMerge == 1)
         {
             return make_naive_tensor_descriptor(make_tuple(NDoHoWo, K_),
-                                                make_tuple(WoStride_, KStrideTensorC_));
+                                                make_tuple(WoStride_, KStrideTensorC_),
+                                                number<VectorSizeC>{},
+                                                I1);
         }
         else
         {
@@ -1321,7 +1408,9 @@ struct TransformConvFwdToGemm
                                                         WoStride_,
                                                         GStrideTensorC_,
                                                         KStrideTensorC_,
-                                                        GStrideTensorC_));
+                                                        GStrideTensorC_),
+                                             number<VectorSizeC>{},
+                                             I1);
             // Padd 1 to NumGroupsToMerge
             const auto padded_desc = transform_tensor_descriptor(
                 nhwo_groups_k_1_desc,
@@ -1372,7 +1461,9 @@ struct TransformConvFwdToGemm
         if constexpr(NumGroupsToMerge == 1)
         {
             return make_naive_tensor_descriptor(make_tuple(NDoHoWo, K_),
-                                                make_tuple(WoStride_, KStrideTensorC_));
+                                                make_tuple(WoStride_, KStrideTensorC_),
+                                                number<VectorSizeC>{},
+                                                I1);
         }
         else
         {
@@ -1384,7 +1475,9 @@ struct TransformConvFwdToGemm
                                                         WoStride_,
                                                         GStrideTensorC_,
                                                         KStrideTensorC_,
-                                                        GStrideTensorC_));
+                                                        GStrideTensorC_),
+                                             number<VectorSizeC>{},
+                                             I1);
             // Padd 1 to NumGroupsToMerge
             const auto padded_desc = transform_tensor_descriptor(
                 nhwo_groups_k_1_desc,
@@ -1417,7 +1510,7 @@ struct TransformConvFwdToGemm
         }
     }
 
-    IndexType G_, N_;
+    IndexType G_, N_, original_N_;
     IndexType Di_, Hi_, Wi_;
     IndexType Do_, Ho_, Wo_;
     IndexType Z_, Y_, X_;
diff --git a/include/ck_tile/ops/image_to_column.hpp b/include/ck_tile/ops/image_to_column.hpp
index 93664ea138..2307b05190 100644
--- a/include/ck_tile/ops/image_to_column.hpp
+++ b/include/ck_tile/ops/image_to_column.hpp
@@ -7,5 +7,7 @@
 #include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp"
 #include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
index eb54807d88..bc20057e7a 100644
--- a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
+++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
@@ -175,9 +175,9 @@ struct ImageToColumn
     {
         const auto [M, K] = CalculateMKDims(kargs);
 
-        const index_t iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
-        const index_t iK     = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock);
-        const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
+        const index_t iK     = amd_wave_read_first_lane(blockIdx.y * kKPerBlock);
+        const index_t iBatch = amd_wave_read_first_lane(blockIdx.z);
 
         const auto in_offset  = iBatch * kargs.image_g_n_c_wis_strides[I0];
         const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0];
diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp
index afbb817db1..9ce22137bf 100644
--- a/include/ck_tile/ops/layernorm2d.hpp
+++ b/include/ck_tile/ops/layernorm2d.hpp
@@ -10,5 +10,7 @@
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index 6998b358d8..0181a3291f 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -134,7 +134,11 @@ struct Layernorm2dFwd
         return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? Problem::BlockShape::template GetBlockSize<true>()
+                           : Problem::BlockShape::template GetBlockSize<false>();
+    }
 
     // clang-format off
     template <typename T> struct t2s;
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 0de1ada87c..422950b143 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -99,7 +99,7 @@ struct Layernorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         // total number of count assume current iter have no pad(only last iter has pad)
         constexpr index_t count_per_iter =
@@ -119,7 +119,7 @@ struct Layernorm2dFwdPipelineTwoPass
         auto mean         = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
         auto var          = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x            = load_tile(x_window);
             auto x_resi       = load_tile(x_residual_window);
@@ -197,7 +197,7 @@ struct Layernorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // layernorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor<ComputeDataType>(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/norm_reduce.hpp b/include/ck_tile/ops/norm_reduce.hpp
index 7dc3e8b7e7..aa074b7f9f 100644
--- a/include/ck_tile/ops/norm_reduce.hpp
+++ b/include/ck_tile/ops/norm_reduce.hpp
@@ -7,5 +7,7 @@
 #include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp"
 #include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp
index 1cc3d9cbc3..46512c57fe 100644
--- a/include/ck_tile/ops/permute.hpp
+++ b/include/ck_tile/ops/permute.hpp
@@ -6,5 +6,7 @@
 #include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp"
 #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index a6721c9305..d628e9c945 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -11,5 +11,7 @@
 #include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
 #include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
index 5755f38475..83a22aaded 100644
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -26,6 +26,10 @@ struct Reduce
     using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
 
     static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? kBlockSize / 2 : kBlockSize;
+    }
 
     private:
     // Helper function to calculate optimal vector size for input tensor
@@ -152,7 +156,7 @@ struct Reduce
         const auto merged_reduce_len =
             transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
 
         auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -163,7 +167,7 @@ struct Reduce
         auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x = load_tile(x_window);
             block_reduce2d(x, y_compute, reduce_func);
diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
index 31eb1f2f4f..0499fe370b 100644
--- a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
@@ -25,11 +25,18 @@ struct Reduce2dShape
     static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
     static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
 
-    static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N;
+    static constexpr index_t RepeatInWarp =
+        Warp_M * Warp_N / ThreadTile_M / ThreadTile_N / ck_tile::get_warp_size();
+    static constexpr index_t RepeatInWarp_M =
+        (Warp_M / ThreadTile_M > Warp_N / ThreadTile_N) ? RepeatInWarp : 1;
+    static constexpr index_t RepeatInWarp_N =
+        (Warp_M / ThreadTile_M > Warp_N / ThreadTile_N) ? 1 : RepeatInWarp;
 
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+    static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M / RepeatInWarp_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N / RepeatInWarp_N;
+
+    static constexpr index_t Repeat_M = Block_M * RepeatInWarp_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N * RepeatInWarp_N / (WarpPerBlock_N * Warp_N);
 
     static constexpr index_t BlockSize =
         ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp
index 610541b2e4..00afcf4aed 100644
--- a/include/ck_tile/ops/rmsnorm2d.hpp
+++ b/include/ck_tile/ops/rmsnorm2d.hpp
@@ -11,5 +11,7 @@
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
index e7f4ce0ba8..32586a6343 100644
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
@@ -124,7 +124,11 @@ struct Rmsnorm2dFwd
         return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? Problem::BlockShape::template GetBlockSize<true>()
+                           : Problem::BlockShape::template GetBlockSize<false>();
+    }
 
     // clang-format off
     template <typename T> struct t2s;
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
index d01f37879a..ca3cdc37c4 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -82,7 +82,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
@@ -95,7 +95,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         auto square_sum         = block_reduce2d.template MakeYBlockTile<ComputeTensorType>();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -151,7 +151,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // rmsnorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor<ComputeDataType>(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp
index dc164dc1a0..1aa14c69e1 100644
--- a/include/ck_tile/ops/smoothquant.hpp
+++ b/include/ck_tile/ops/smoothquant.hpp
@@ -10,5 +10,7 @@
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp"
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
index b70e996617..f6c7c0753a 100644
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -93,7 +93,11 @@ struct MoeSmoothquant
         return dim3(hargs.topk, integer_divide_ceil(hargs.tokens, Block_M), 1);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? Problem::BlockShape::template GetBlockSize<true>()
+                           : Problem::BlockShape::template GetBlockSize<false>();
+    }
 
     // clang-format off
     template <typename T> struct t2s;
@@ -134,7 +138,7 @@ struct MoeSmoothquant
         const index_t i_topk  = blockIdx.x;
         const index_t i_token = blockIdx.y * Block_M;
         const index_t i_token_in_thrd =
-            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+            amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
 
         const index_t i_expert = reinterpret_cast<const index_t*>(
             kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
index 7dc913901e..e0ea9692c5 100644
--- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
@@ -82,7 +82,11 @@ struct Smoothquant
         return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return is_wave32() ? Problem::BlockShape::template GetBlockSize<true>()
+                           : Problem::BlockShape::template GetBlockSize<false>();
+    }
 
     // clang-format off
     template <typename T> struct t2s;
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
index ba9c6374f1..8b0a7274ed 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass
 
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
         auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
         auto absmax       = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
         move_tile_window(qy_window, {0, stride_to_right_most_window});
 
         // recompute y and quantize y to qy
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp
index b23e869d81..d559dc15e2 100644
--- a/include/ck_tile/ops/softmax.hpp
+++ b/include/ck_tile/ops/softmax.hpp
@@ -6,5 +6,7 @@
 #include "ck_tile/ops/softmax/block/block_softmax_2d.hpp"
 #include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp
index 1dc563f757..040c6b8ddc 100644
--- a/include/ck_tile/ops/topk.hpp
+++ b/include/ck_tile/ops/topk.hpp
@@ -6,5 +6,7 @@
 #include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp"
 #include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp
index d0a810de4f..d9657a9764 100644
--- a/include/ck_tile/ops/topk_softmax.hpp
+++ b/include/ck_tile/ops/topk_softmax.hpp
@@ -8,5 +8,7 @@
 #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
 #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
index 277049f6b0..e8727ea065 100644
--- a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -96,9 +96,9 @@ struct TopkSoftmaxKernel
         if(block_row_id > kargs.num_rows)
             return;
 
-        index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
-        index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
-        index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
+        index_t block_os_inp = amd_wave_read_first_lane(block_row_id * kargs.stride_input);
+        index_t block_os_out = amd_wave_read_first_lane(block_row_id * kargs.stride_output);
+        index_t num_rows_rem = amd_wave_read_first_lane(kargs.num_rows - block_row_id);
 
         const auto input_window = [&]() {
             const InputType* p_input =
diff --git a/include/ck_tile/utility/json_dump.hpp b/include/ck_tile/utility/json_dump.hpp
new file mode 100644
index 0000000000..26af906ed0
--- /dev/null
+++ b/include/ck_tile/utility/json_dump.hpp
@@ -0,0 +1,716 @@
+#ifdef CK_ENABLE_JSON_DUMP
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
+#include "rapidjson/writer.h"
+#include "rapidjson/stringbuffer.h"
+#include "rapidjson/document.h"
+#include "rapidjson/rapidjson.h"
+#pragma GCC diagnostic pop
+
+#define START_JSON_DUMP_FILE(file_name)                                             \
+    std::string file_str(file_name);                                                \
+    std::ofstream file(file_str);                                                   \
+    if(!file.is_open())                                                             \
+    {                                                                               \
+        throw std::runtime_error("Could not open file: " + std::string(file_name)); \
+    }                                                                               \
+    rapidjson::StringBuffer s;                                                      \
+    rapidjson::Writer<rapidjson::StringBuffer> writer(s);                           \
+    writer.StartObject();
+
+#define END_JSON_DUMP_FILE() \
+    writer.EndObject();      \
+    file << s.GetString();   \
+    file.close();            \
+    std::cout << "Results written to " << file_str << " successfully" << std::endl;
+
+#define ADD_KEY_VALUE(key, value) add_key_value_pair(writer, key, value);
+#define ADD_PERF_TO_JSON(_time, tflops, gbytes) add_perf_to_json(writer, _time, tflops, gbytes);
+
+template <typename T>
+void add_key_value_pair(rapidjson::Writer<rapidjson::StringBuffer>& writer,
+                        const char* key,
+                        T value)
+{
+    writer.Key(key);
+    if constexpr(std::is_same<T, const char*>::value)
+    {
+        writer.String(value, static_cast<rapidjson::SizeType>(std::strlen(value)));
+    }
+    else if constexpr(std::is_same<T, std::string>::value)
+    {
+        writer.String(value.c_str(), static_cast<rapidjson::SizeType>(value.length()));
+    }
+    else if constexpr(std::is_floating_point<T>::value)
+    {
+        writer.Double(static_cast<double>(value));
+    }
+    else if constexpr(std::is_integral<T>::value)
+    {
+        writer.Int64(static_cast<int64_t>(value));
+    }
+    else
+    {
+        static_assert(std::is_same<T, const char*>::value || std::is_floating_point<T>::value ||
+                          std::is_integral<T>::value,
+                      "Unsupported type for JSON serialization");
+    }
+}
+
+static void add_perf_to_json(rapidjson::Writer<rapidjson::StringBuffer>& writer,
+                             float time,
+                             float tflops,
+                             float gbytes)
+{
+    std::string roster("perf");
+    writer.String(roster.c_str(), static_cast<rapidjson::SizeType>(roster.length()));
+
+    writer.StartArray();
+    writer.StartObject();
+
+    add_key_value_pair(writer, "time", time);
+    add_key_value_pair(writer, "tflops", tflops);
+    add_key_value_pair(writer, "gbytes", gbytes);
+
+    writer.EndObject();
+    writer.EndArray();
+}
+
+#else
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wunused-local-typedef"
+#define START_JSON_DUMP_FILE(file_name)
+#define END_JSON_DUMP_FILE() \
+    std::cout << "JSON dump disabled, To enable, set CK_ENABLE_JSON_DUMP cmake option" << std::endl;
+
+#define ADD_KEY_VALUE(key, value)
+#define ADD_PERF_TO_JSON(_time, tflops, gbytes)
+#endif
+
+// Helper traits to check for static member existence
+template <typename T, typename = void>
+struct has_warp_tile_members : std::false_type
+{
+};
+
+template <typename T>
+struct has_warp_tile_members<
+    T,
+    std::void_t<decltype(T::M_Warp_Tile), decltype(T::N_Warp_Tile), decltype(T::K_Warp_Tile)>>
+    : std::true_type
+{
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename GemmConfig,
+          template <typename>
+          typename DTypeTraits>
+void dump_gemm_json_results(const std::string& json_filename,
+                            int M,
+                            int N,
+                            int K,
+                            int stride_A,
+                            int stride_B,
+                            int stride_C,
+                            bool persistent,
+                            bool pass,
+                            float ave_time,
+                            float tflops,
+                            float gb_per_sec,
+                            const std::string& kernel_name = "gemm_basic")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("M", M);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("K", K);
+    ADD_KEY_VALUE("stride_A", stride_A);
+    ADD_KEY_VALUE("stride_B", stride_B);
+    ADD_KEY_VALUE("stride_C", stride_C);
+    ADD_KEY_VALUE("A_layout", ALayout::name);
+    ADD_KEY_VALUE("B_layout", BLayout::name);
+    ADD_KEY_VALUE("C_layout", CLayout::name);
+    using TraitsADataType = DTypeTraits<ADataType>;
+    using TraitsBDataType = DTypeTraits<BDataType>;
+    using TraitsCDataType = DTypeTraits<CDataType>;
+    ADD_KEY_VALUE("A_type", TraitsADataType::name);
+    ADD_KEY_VALUE("B_type", TraitsBDataType::name);
+    ADD_KEY_VALUE("C_type", TraitsCDataType::name);
+    ADD_KEY_VALUE("structured_sparsity", GemmConfig::UseStructuredSparsity ? "on" : "off");
+
+    if constexpr(has_warp_tile_members<GemmConfig>::value)
+    {
+        ADD_KEY_VALUE("warp_tile",
+                      std::to_string(GemmConfig::M_Warp_Tile) + "x" +
+                          std::to_string(GemmConfig::N_Warp_Tile) + "x" +
+                          std::to_string(GemmConfig::K_Warp_Tile));
+    }
+    ADD_KEY_VALUE("persistent", persistent ? "on" : "off");
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec);
+    END_JSON_DUMP_FILE();
+}
+
+void dump_batched_gemm_json_results(const std::string& json_filename,
+                                    const std::string& op_name,
+                                    int M,
+                                    int N,
+                                    int K,
+                                    int stride_A,
+                                    int stride_B,
+                                    int stride_C,
+                                    int batch_stride_A,
+                                    int batch_stride_B,
+                                    int batch_stride_C,
+                                    int batch_count,
+                                    bool pass,
+                                    float ave_time,
+                                    float tflops,
+                                    float gb_per_sec,
+                                    const std::string& kernel_name = "batched_gemm_basic")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("op_name", op_name);
+    ADD_KEY_VALUE("M", M);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("K", K);
+    ADD_KEY_VALUE("stride_A", stride_A);
+    ADD_KEY_VALUE("stride_B", stride_B);
+    ADD_KEY_VALUE("stride_C", stride_C);
+    ADD_KEY_VALUE("batch_stride_A", batch_stride_A);
+    ADD_KEY_VALUE("batch_stride_B", batch_stride_B);
+    ADD_KEY_VALUE("batch_stride_C", batch_stride_C);
+    ADD_KEY_VALUE("batch_count", batch_count);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+void dump_grouped_gemm_json_results(const std::string& json_filename,
+                                    const std::string& op_name,
+                                    int group_count,
+                                    bool pass,
+                                    float ave_time,
+                                    float tflops,
+                                    float gb_per_sec,
+                                    const std::string& kernel_name = "grouped_gemm")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("op_name", op_name);
+    ADD_KEY_VALUE("group_count", group_count);
+    ADD_KEY_VALUE("A_layout", ALayout::name);
+    ADD_KEY_VALUE("B_layout", BLayout::name);
+    ADD_KEY_VALUE("C_layout", CLayout::name);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_flatmm_json_results(const std::string& json_filename,
+                              const std::string& datatype,
+                              int M,
+                              int N,
+                              int K,
+                              int stride_A,
+                              int stride_B,
+                              int stride_C,
+                              int kbatch,
+                              bool pass,
+                              float ave_time,
+                              float tflops,
+                              float gb_per_sec,
+                              const std::string& kernel_name = "flatmm_basic")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("DataType", datatype);
+    ADD_KEY_VALUE("M", M);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("K", K);
+    ADD_KEY_VALUE("StrideA", stride_A);
+    ADD_KEY_VALUE("StrideB", stride_B);
+    ADD_KEY_VALUE("StrideC", stride_C);
+    ADD_KEY_VALUE("kbatch", kbatch);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_gemm_multi_d_fp16_json_results(const std::string& json_filename,
+                                         const std::string& op_name,
+                                         int M,
+                                         int N,
+                                         int K,
+                                         int StrideA,
+                                         int StrideB,
+                                         int StrideD0,
+                                         int StrideD1,
+                                         int StrideE,
+                                         bool pass,
+                                         float ave_time,
+                                         float tflops,
+                                         float gb_per_sec,
+                                         const std::string& kernel_name = "gemm_multi_d_fp16")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("op_name", op_name);
+    ADD_KEY_VALUE("M", M);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("K", K);
+    ADD_KEY_VALUE("StrideA", StrideA);
+    ADD_KEY_VALUE("StrideB", StrideB);
+    ADD_KEY_VALUE("StrideD0", StrideD0);
+    ADD_KEY_VALUE("StrideD1", StrideD1);
+    ADD_KEY_VALUE("StrideE", StrideE);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_elementwise_json_results(const std::string& json_filename,
+                                   const std::string& prec,
+                                   int grid_size,
+                                   int block_size,
+                                   float ave_time,
+                                   float tflops,
+                                   float gb_per_sec,
+                                   const std::string& kernel_name = "elementwise")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec", prec);
+    ADD_KEY_VALUE("grid_size", grid_size);
+    ADD_KEY_VALUE("block_size", block_size);
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_layernorm2d_fwd_json_results(const std::string& json_filename,
+                                       const std::string& prec_i,
+                                       const std::string& prec_o,
+                                       const std::string& prec_sm,
+                                       const std::string& prec_sy,
+                                       int m,
+                                       int n,
+                                       int x_stride,
+                                       int xr_stride,
+                                       int y_stride,
+                                       int yr_stride,
+                                       bool pass,
+                                       float ave_time,
+                                       float tflops,
+                                       float gb_per_sec,
+                                       const std::string& kernel_name = "layernorm2d_fwd")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec_i", prec_i);
+    ADD_KEY_VALUE("prec_o", prec_o);
+    ADD_KEY_VALUE("prec_sm", prec_sm);
+    ADD_KEY_VALUE("prec_sy", prec_sy);
+    ADD_KEY_VALUE("m", m);
+    ADD_KEY_VALUE("n", n);
+    ADD_KEY_VALUE("x_stride", x_stride);
+    ADD_KEY_VALUE("xr_stride", xr_stride);
+    ADD_KEY_VALUE("y_stride", y_stride);
+    ADD_KEY_VALUE("yr_stride", yr_stride);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+template <typename DataType, template <typename> typename DTypeTraits>
+void dump_reduce_json_results(const std::string& json_filename,
+                              int N,
+                              int C,
+                              int H,
+                              int W,
+                              bool pass,
+                              float ave_time,
+                              float tflops,
+                              float gb_per_sec,
+                              const std::string& kernel_name = "reduce")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    using Traits = DTypeTraits<DataType>;
+    ADD_KEY_VALUE("data_type", Traits::name);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("C", C);
+    ADD_KEY_VALUE("H", H);
+    ADD_KEY_VALUE("W", W);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_permute_json_results(const std::string& json_filename,
+                               const std::string& data_type,
+                               bool pass,
+                               float ave_time,
+                               float tflop,
+                               float gb_per_sec,
+                               const std::string& kernel_name = "permute")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("data_type", data_type);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflop, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_topk_softmax_json(const std::string& json_filename,
+                            const std::string& input_prec,
+                            const std::string& weight_prec,
+                            int tokens,
+                            int experts,
+                            int topk,
+                            int stride_input,
+                            int stride_output,
+                            float ave_time,
+                            float tflop,
+                            float gb_per_sec,
+                            bool pass,
+                            const std::string& kernel_name = "topk_softmax")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("input_prec", input_prec);
+    ADD_KEY_VALUE("weight_prec", weight_prec);
+    ADD_KEY_VALUE("tokens", tokens);
+    ADD_KEY_VALUE("experts", experts);
+    ADD_KEY_VALUE("topk", topk);
+    ADD_KEY_VALUE("stride_input", stride_input);
+    ADD_KEY_VALUE("stride_output", stride_output);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflop, gb_per_sec);
+    END_JSON_DUMP_FILE();
+}
+
+void dump_rmsnorm2d_fwd_json(const std::string& json_filename,
+                             const std::string& prec_str,
+                             int m,
+                             int n,
+                             int x_stride,
+                             int xr_stride,
+                             int y_stride,
+                             int yr_stride,
+                             int use_model_sensitive_rmsnorm,
+                             float ave_time,
+                             float tflops,
+                             float gb_per_sec,
+                             bool pass,
+                             const std::string& kernel_name = "rmsnorm2d_fwd")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec", prec_str);
+    ADD_KEY_VALUE("m", m);
+    ADD_KEY_VALUE("n", n);
+    ADD_KEY_VALUE("x_stride", x_stride);
+    ADD_KEY_VALUE("xr_stride", xr_stride);
+    ADD_KEY_VALUE("y_stride", y_stride);
+    ADD_KEY_VALUE("yr_stride", yr_stride);
+    ADD_KEY_VALUE("use_model_sensitive_rmsnorm", use_model_sensitive_rmsnorm);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec);
+    END_JSON_DUMP_FILE();
+}
+
+void dump_add_rmsnorm2d_rdquant_fwd_json(
+    const std::string& json_filename,
+    const std::string& input_data_type,
+    const std::string& quantized_data_type,
+    int m,
+    int n,
+    int stride,
+    float epsilon,
+    float ave_time,
+    float tflops,
+    float gb_per_sec,
+    bool pass,
+    const std::string& kernel_name = "add_rmsnorm2d_rdquant_fwd")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("input_data_type", input_data_type);
+    ADD_KEY_VALUE("quantized_data_type", quantized_data_type);
+    ADD_KEY_VALUE("m", m);
+    ADD_KEY_VALUE("n", n);
+    ADD_KEY_VALUE("stride", stride);
+    ADD_KEY_VALUE("epsilon", epsilon);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec);
+    END_JSON_DUMP_FILE();
+}
+
+void dump_smoothquant_json(const std::string& json_filename,
+                           const std::string& prec_str,
+                           int m,
+                           int n,
+                           int x_stride,
+                           int y_stride,
+                           float ave_time,
+                           float tflops,
+                           float gb_per_sec,
+                           bool pass,
+                           const std::string& kernel_name = "smoothquant")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec", prec_str);
+    ADD_KEY_VALUE("m", m);
+    ADD_KEY_VALUE("n", n);
+    ADD_KEY_VALUE("x_stride", x_stride);
+    ADD_KEY_VALUE("y_stride", y_stride);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec);
+    END_JSON_DUMP_FILE();
+}
+
+void dump_moe_sorting_json(const std::string& json_filename,
+                           const std::string& index_prec,
+                           const std::string& weight_prec,
+                           const std::string& workspace_size,
+                           int dispatch_policy,
+                           int tokens,
+                           int num_experts,
+                           int topk,
+                           float ave_time,
+                           float tflops,
+                           float gb_per_sec,
+                           bool pass,
+                           const std::string& kernel_name = "moe_sorting")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("index_prec", index_prec);
+    ADD_KEY_VALUE("weight_prec", weight_prec);
+    ADD_KEY_VALUE("workspace_size", workspace_size);
+    ADD_KEY_VALUE("dispatch_policy", dispatch_policy);
+    ADD_KEY_VALUE("tokens", tokens);
+    ADD_KEY_VALUE("num_experts", num_experts);
+    ADD_KEY_VALUE("topk", topk);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_batched_transpose_json(const std::string& json_filename,
+                                 int N,
+                                 int C,
+                                 int H,
+                                 int W,
+                                 const std::string& layout_in,
+                                 const std::string& layout_out,
+                                 const std::string& prec,
+                                 float ave_time,
+                                 float tflops,
+                                 float gb_per_sec,
+                                 bool pass,
+                                 const std::string& kernel_name = "batched_transpose")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("N", N);
+    ADD_KEY_VALUE("C", C);
+    ADD_KEY_VALUE("H", H);
+    ADD_KEY_VALUE("W", W);
+    ADD_KEY_VALUE("LayoutIn", layout_in);
+    ADD_KEY_VALUE("LayoutOut", layout_out);
+    ADD_KEY_VALUE("Precision", prec);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_moe_smoothquant_json(const std::string& json_filename,
+                               const std::string& prec_i,
+                               const std::string& prec_o,
+                               int tokens,
+                               int hidden_size,
+                               int stride,
+                               int experts,
+                               int topk,
+                               bool pass,
+                               float ave_time,
+                               float tflops,
+                               float gb_per_sec,
+                               const std::string& kernel_name = "moe_smoothquant")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec_i", prec_i);
+    ADD_KEY_VALUE("prec_o", prec_o);
+    ADD_KEY_VALUE("tokens", tokens);
+    ADD_KEY_VALUE("hidden_size", hidden_size);
+    ADD_KEY_VALUE("stride", stride);
+    ADD_KEY_VALUE("experts", experts);
+    ADD_KEY_VALUE("topk", topk);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_fused_moe_json(const std::string& json_filename,
+                         const std::string& api_str,
+                         const std::string& prec_str,
+                         int tokens,
+                         bool is_local_token,
+                         int local_tokens,
+                         int experts,
+                         int topk,
+                         int hidden_size,
+                         int intermediate_size,
+                         int stride,
+                         int block_m,
+                         int activation,
+                         bool gate_only,
+                         bool fused_quant,
+                         bool pass,
+                         float ave_time,
+                         float tflops,
+                         float tb_per_sec,
+                         const std::string& kernel_name = "fused_moe")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("api", api_str);
+    ADD_KEY_VALUE("prec", prec_str);
+    ADD_KEY_VALUE("tokens", tokens);
+    if(is_local_token)
+    {
+        ADD_KEY_VALUE("local_tokens", local_tokens);
+    }
+    ADD_KEY_VALUE("experts", experts);
+    ADD_KEY_VALUE("topk", topk);
+    ADD_KEY_VALUE("hidden_size", hidden_size);
+    ADD_KEY_VALUE("intermediate_size", intermediate_size);
+    ADD_KEY_VALUE("stride", stride);
+    ADD_KEY_VALUE("block_m", block_m);
+    ADD_KEY_VALUE("activation", activation);
+    ADD_KEY_VALUE("gate_only", gate_only);
+    ADD_KEY_VALUE("fused_quant", fused_quant);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, (tb_per_sec * 1024.0f))
+    END_JSON_DUMP_FILE();
+}
+
+void dump_fmha_fwd_json_results(const std::string& json_filename,
+                                const std::string& prec,
+                                const std::string& mode,
+                                const std::string& io_layout,
+                                int batch,
+                                int nhead,
+                                int nhead_k,
+                                int seqlen_qs,
+                                int seqlen_ks,
+                                int seqlen_kpads,
+                                int hdim_q,
+                                int hdim_v,
+                                float scale_s,
+                                float p_drop,
+                                bool lse,
+                                bool squant,
+                                const std::string& bias,
+                                const std::string& vlayout,
+                                bool pass,
+                                float ave_time,
+                                float tflops,
+                                float gb_per_sec,
+                                const std::string& kernel_name = "fmha_fwd")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec", prec);
+    ADD_KEY_VALUE("mode", mode);
+    ADD_KEY_VALUE("io_layout", io_layout);
+    ADD_KEY_VALUE("batch", batch);
+    ADD_KEY_VALUE("nhead", nhead);
+    ADD_KEY_VALUE("nhead_k", nhead_k);
+    ADD_KEY_VALUE("seqlen_q", seqlen_qs);
+    ADD_KEY_VALUE("seqlen_k", seqlen_ks);
+    ADD_KEY_VALUE("seqlen_kpads", seqlen_kpads);
+    ADD_KEY_VALUE("hdim_q", hdim_q);
+    ADD_KEY_VALUE("hdim_v", hdim_v);
+    ADD_KEY_VALUE("scale_s", scale_s);
+    ADD_KEY_VALUE("p_drop", p_drop);
+    ADD_KEY_VALUE("lse", lse);
+    ADD_KEY_VALUE("squant", squant);
+    ADD_KEY_VALUE("bias", bias);
+    ADD_KEY_VALUE("vlayout", vlayout);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+void dump_fmha_bwd_json_results(const std::string& json_filename,
+                                const std::string& data_type,
+                                const std::string& mode,
+                                const std::string& i_perm,
+                                const std::string& o_perm,
+                                int batch,
+                                int nhead,
+                                int nhead_k,
+                                int seqlen_q,
+                                int seqlen_k,
+                                int hdim_q,
+                                int hdim_v,
+                                float scale,
+                                const std::string& bias,
+                                bool use_dbias,
+                                float p_drop,
+                                bool s_randval,
+                                bool deterministic,
+                                const std::string& mask,
+                                int mask_left,
+                                int mask_right,
+                                int workspace_size,
+                                bool pass,
+                                float ave_time,
+                                float tflops,
+                                float gb_per_sec,
+                                const std::string& kernel_name = "fmha_bwd")
+{
+    START_JSON_DUMP_FILE(json_filename);
+    ADD_KEY_VALUE("name", kernel_name);
+    ADD_KEY_VALUE("prec", data_type);
+    ADD_KEY_VALUE("mode", mode);
+    ADD_KEY_VALUE("i_perm", i_perm);
+    ADD_KEY_VALUE("o_perm", o_perm);
+    ADD_KEY_VALUE("batch", batch);
+    ADD_KEY_VALUE("nhead", nhead);
+    ADD_KEY_VALUE("nhead_k", nhead_k);
+    ADD_KEY_VALUE("seqlen_q", seqlen_q);
+    ADD_KEY_VALUE("seqlen_k", seqlen_k);
+    ADD_KEY_VALUE("hdim_q", hdim_q);
+    ADD_KEY_VALUE("hdim_v", hdim_v);
+    ADD_KEY_VALUE("scale", scale);
+    ADD_KEY_VALUE("bias", bias);
+    ADD_KEY_VALUE("use_dbias", use_dbias);
+    ADD_KEY_VALUE("p_drop", p_drop);
+    ADD_KEY_VALUE("s_randval", s_randval);
+    ADD_KEY_VALUE("deterministic", deterministic ? "true" : "false");
+    ADD_KEY_VALUE("mask", mask);
+    ADD_KEY_VALUE("mask_left", mask_left);
+    ADD_KEY_VALUE("mask_right", mask_right);
+    ADD_KEY_VALUE("workspace_size", workspace_size);
+    ADD_KEY_VALUE("verification", pass ? "pass" : "fail");
+    ADD_PERF_TO_JSON(ave_time, tflops, gb_per_sec)
+    END_JSON_DUMP_FILE();
+}
+
+#ifndef CK_ENABLE_JSON_DUMP
+#pragma GCC diagnostic pop
+#endif
diff --git a/include/rapidjson/allocators.h b/include/rapidjson/allocators.h
new file mode 100644
index 0000000000..275417bd8b
--- /dev/null
+++ b/include/rapidjson/allocators.h
@@ -0,0 +1,693 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ALLOCATORS_H_
+#define RAPIDJSON_ALLOCATORS_H_
+
+#include "rapidjson.h"
+#include "internal/meta.h"
+
+#include <memory>
+#include <limits>
+
+#if RAPIDJSON_HAS_CXX11
+#include <type_traits>
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// Allocator
+
+/*! \class rapidjson::Allocator
+    \brief Concept for allocating, resizing and freeing memory block.
+    
+    Note that Malloc() and Realloc() are non-static but Free() is static.
+    
+    So if an allocator need to support Free(), it needs to put its pointer in 
+    the header of memory block.
+
+\code
+concept Allocator {
+    static const bool kNeedFree;    //!< Whether this allocator needs to call Free().
+
+    // Allocate a memory block.
+    // \param size of the memory block in bytes.
+    // \returns pointer to the memory block.
+    void* Malloc(size_t size);
+
+    // Resize a memory block.
+    // \param originalPtr The pointer to current memory block. Null pointer is permitted.
+    // \param originalSize The current size in bytes. (Design issue: since some allocator may not book-keep this, explicitly pass to it can save memory.)
+    // \param newSize the new size in bytes.
+    void* Realloc(void* originalPtr, size_t originalSize, size_t newSize);
+
+    // Free a memory block.
+    // \param pointer to the memory block. Null pointer is permitted.
+    static void Free(void *ptr);
+};
+\endcode
+*/
+
+
+/*! \def RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-defined kDefaultChunkCapacity definition.
+
+    User can define this as any \c size that is a power of 2.
+*/
+
+#ifndef RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY
+#define RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY (64 * 1024)
+#endif
+
+
+///////////////////////////////////////////////////////////////////////////////
+// CrtAllocator
+
+//! C-runtime library allocator.
+/*! This class is just wrapper for standard C library memory routines.
+    \note implements Allocator concept
+*/
+class CrtAllocator {
+public:
+    static const bool kNeedFree = true;
+    void* Malloc(size_t size) { 
+        if (size) //  behavior of malloc(0) is implementation defined.
+            return RAPIDJSON_MALLOC(size);
+        else
+            return NULL; // standardize to returning NULL.
+    }
+    void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) {
+        (void)originalSize;
+        if (newSize == 0) {
+            RAPIDJSON_FREE(originalPtr);
+            return NULL;
+        }
+        return RAPIDJSON_REALLOC(originalPtr, newSize);
+    }
+    static void Free(void *ptr) RAPIDJSON_NOEXCEPT { RAPIDJSON_FREE(ptr); }
+
+    bool operator==(const CrtAllocator&) const RAPIDJSON_NOEXCEPT {
+        return true;
+    }
+    bool operator!=(const CrtAllocator&) const RAPIDJSON_NOEXCEPT {
+        return false;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// MemoryPoolAllocator
+
+//! Default memory allocator used by the parser and DOM.
+/*! This allocator allocate memory blocks from pre-allocated memory chunks. 
+
+    It does not free memory blocks. And Realloc() only allocate new memory.
+
+    The memory chunks are allocated by BaseAllocator, which is CrtAllocator by default.
+
+    User may also supply a buffer as the first chunk.
+
+    If the user-buffer is full then additional chunks are allocated by BaseAllocator.
+
+    The user-buffer is not deallocated by this allocator.
+
+    \tparam BaseAllocator the allocator type for allocating memory chunks. Default is CrtAllocator.
+    \note implements Allocator concept
+*/
+template <typename BaseAllocator = CrtAllocator>
+class MemoryPoolAllocator {
+    //! Chunk header for perpending to each chunk.
+    /*! Chunks are stored as a singly linked list.
+    */
+    struct ChunkHeader {
+        size_t capacity;    //!< Capacity of the chunk in bytes (excluding the header itself).
+        size_t size;        //!< Current size of allocated memory in bytes.
+        ChunkHeader *next;  //!< Next chunk in the linked list.
+    };
+
+    struct SharedData {
+        ChunkHeader *chunkHead;  //!< Head of the chunk linked-list. Only the head chunk serves allocation.
+        BaseAllocator* ownBaseAllocator; //!< base allocator created by this object.
+        size_t refcount;
+        bool ownBuffer;
+    };
+
+    static const size_t SIZEOF_SHARED_DATA = RAPIDJSON_ALIGN(sizeof(SharedData));
+    static const size_t SIZEOF_CHUNK_HEADER = RAPIDJSON_ALIGN(sizeof(ChunkHeader));
+
+    static inline ChunkHeader *GetChunkHead(SharedData *shared)
+    {
+        return reinterpret_cast<ChunkHeader*>(reinterpret_cast<uint8_t*>(shared) + SIZEOF_SHARED_DATA);
+    }
+    static inline uint8_t *GetChunkBuffer(SharedData *shared)
+    {
+        return reinterpret_cast<uint8_t*>(shared->chunkHead) + SIZEOF_CHUNK_HEADER;
+    }
+
+    static const size_t kDefaultChunkCapacity = RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY; //!< Default chunk capacity.
+
+public:
+    static const bool kNeedFree = false;    //!< Tell users that no need to call Free() with this allocator. (concept Allocator)
+    static const bool kRefCounted = true;   //!< Tell users that this allocator is reference counted on copy
+
+    //! Constructor with chunkSize.
+    /*! \param chunkSize The size of memory chunk. The default is kDefaultChunkSize.
+        \param baseAllocator The allocator for allocating memory chunks.
+    */
+    explicit
+    MemoryPoolAllocator(size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : 
+        chunk_capacity_(chunkSize),
+        baseAllocator_(baseAllocator ? baseAllocator : RAPIDJSON_NEW(BaseAllocator)()),
+        shared_(static_cast<SharedData*>(baseAllocator_ ? baseAllocator_->Malloc(SIZEOF_SHARED_DATA + SIZEOF_CHUNK_HEADER) : 0))
+    {
+        RAPIDJSON_ASSERT(baseAllocator_ != 0);
+        RAPIDJSON_ASSERT(shared_ != 0);
+        if (baseAllocator) {
+            shared_->ownBaseAllocator = 0;
+        }
+        else {
+            shared_->ownBaseAllocator = baseAllocator_;
+        }
+        shared_->chunkHead = GetChunkHead(shared_);
+        shared_->chunkHead->capacity = 0;
+        shared_->chunkHead->size = 0;
+        shared_->chunkHead->next = 0;
+        shared_->ownBuffer = true;
+        shared_->refcount = 1;
+    }
+
+    //! Constructor with user-supplied buffer.
+    /*! The user buffer will be used firstly. When it is full, memory pool allocates new chunk with chunk size.
+
+        The user buffer will not be deallocated when this allocator is destructed.
+
+        \param buffer User supplied buffer.
+        \param size Size of the buffer in bytes. It must at least larger than sizeof(ChunkHeader).
+        \param chunkSize The size of memory chunk. The default is kDefaultChunkSize.
+        \param baseAllocator The allocator for allocating memory chunks.
+    */
+    MemoryPoolAllocator(void *buffer, size_t size, size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) :
+        chunk_capacity_(chunkSize),
+        baseAllocator_(baseAllocator),
+        shared_(static_cast<SharedData*>(AlignBuffer(buffer, size)))
+    {
+        RAPIDJSON_ASSERT(size >= SIZEOF_SHARED_DATA + SIZEOF_CHUNK_HEADER);
+        shared_->chunkHead = GetChunkHead(shared_);
+        shared_->chunkHead->capacity = size - SIZEOF_SHARED_DATA - SIZEOF_CHUNK_HEADER;
+        shared_->chunkHead->size = 0;
+        shared_->chunkHead->next = 0;
+        shared_->ownBaseAllocator = 0;
+        shared_->ownBuffer = false;
+        shared_->refcount = 1;
+    }
+
+    MemoryPoolAllocator(const MemoryPoolAllocator& rhs) RAPIDJSON_NOEXCEPT :
+        chunk_capacity_(rhs.chunk_capacity_),
+        baseAllocator_(rhs.baseAllocator_),
+        shared_(rhs.shared_)
+    {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        ++shared_->refcount;
+    }
+    MemoryPoolAllocator& operator=(const MemoryPoolAllocator& rhs) RAPIDJSON_NOEXCEPT
+    {
+        RAPIDJSON_NOEXCEPT_ASSERT(rhs.shared_->refcount > 0);
+        ++rhs.shared_->refcount;
+        this->~MemoryPoolAllocator();
+        baseAllocator_ = rhs.baseAllocator_;
+        chunk_capacity_ = rhs.chunk_capacity_;
+        shared_ = rhs.shared_;
+        return *this;
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    MemoryPoolAllocator(MemoryPoolAllocator&& rhs) RAPIDJSON_NOEXCEPT :
+        chunk_capacity_(rhs.chunk_capacity_),
+        baseAllocator_(rhs.baseAllocator_),
+        shared_(rhs.shared_)
+    {
+        RAPIDJSON_NOEXCEPT_ASSERT(rhs.shared_->refcount > 0);
+        rhs.shared_ = 0;
+    }
+    MemoryPoolAllocator& operator=(MemoryPoolAllocator&& rhs) RAPIDJSON_NOEXCEPT
+    {
+        RAPIDJSON_NOEXCEPT_ASSERT(rhs.shared_->refcount > 0);
+        this->~MemoryPoolAllocator();
+        baseAllocator_ = rhs.baseAllocator_;
+        chunk_capacity_ = rhs.chunk_capacity_;
+        shared_ = rhs.shared_;
+        rhs.shared_ = 0;
+        return *this;
+    }
+#endif
+
+    //! Destructor.
+    /*! This deallocates all memory chunks, excluding the user-supplied buffer.
+    */
+    ~MemoryPoolAllocator() RAPIDJSON_NOEXCEPT {
+        if (!shared_) {
+            // do nothing if moved
+            return;
+        }
+        if (shared_->refcount > 1) {
+            --shared_->refcount;
+            return;
+        }
+        Clear();
+        BaseAllocator *a = shared_->ownBaseAllocator;
+        if (shared_->ownBuffer) {
+            baseAllocator_->Free(shared_);
+        }
+        RAPIDJSON_DELETE(a);
+    }
+
+    //! Deallocates all memory chunks, excluding the first/user one.
+    void Clear() RAPIDJSON_NOEXCEPT {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        for (;;) {
+            ChunkHeader* c = shared_->chunkHead;
+            if (!c->next) {
+                break;
+            }
+            shared_->chunkHead = c->next;
+            baseAllocator_->Free(c);
+        }
+        shared_->chunkHead->size = 0;
+    }
+
+    //! Computes the total capacity of allocated memory chunks.
+    /*! \return total capacity in bytes.
+    */
+    size_t Capacity() const RAPIDJSON_NOEXCEPT {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        size_t capacity = 0;
+        for (ChunkHeader* c = shared_->chunkHead; c != 0; c = c->next)
+            capacity += c->capacity;
+        return capacity;
+    }
+
+    //! Computes the memory blocks allocated.
+    /*! \return total used bytes.
+    */
+    size_t Size() const RAPIDJSON_NOEXCEPT {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        size_t size = 0;
+        for (ChunkHeader* c = shared_->chunkHead; c != 0; c = c->next)
+            size += c->size;
+        return size;
+    }
+
+    //! Whether the allocator is shared.
+    /*! \return true or false.
+    */
+    bool Shared() const RAPIDJSON_NOEXCEPT {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        return shared_->refcount > 1;
+    }
+
+    //! Allocates a memory block. (concept Allocator)
+    void* Malloc(size_t size) {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        if (!size)
+            return NULL;
+
+        size = RAPIDJSON_ALIGN(size);
+        if (RAPIDJSON_UNLIKELY(shared_->chunkHead->size + size > shared_->chunkHead->capacity))
+            if (!AddChunk(chunk_capacity_ > size ? chunk_capacity_ : size))
+                return NULL;
+
+        void *buffer = GetChunkBuffer(shared_) + shared_->chunkHead->size;
+        shared_->chunkHead->size += size;
+        return buffer;
+    }
+
+    //! Resizes a memory block (concept Allocator)
+    void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) {
+        if (originalPtr == 0)
+            return Malloc(newSize);
+
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        if (newSize == 0)
+            return NULL;
+
+        originalSize = RAPIDJSON_ALIGN(originalSize);
+        newSize = RAPIDJSON_ALIGN(newSize);
+
+        // Do not shrink if new size is smaller than original
+        if (originalSize >= newSize)
+            return originalPtr;
+
+        // Simply expand it if it is the last allocation and there is sufficient space
+        if (originalPtr == GetChunkBuffer(shared_) + shared_->chunkHead->size - originalSize) {
+            size_t increment = static_cast<size_t>(newSize - originalSize);
+            if (shared_->chunkHead->size + increment <= shared_->chunkHead->capacity) {
+                shared_->chunkHead->size += increment;
+                return originalPtr;
+            }
+        }
+
+        // Realloc process: allocate and copy memory, do not free original buffer.
+        if (void* newBuffer = Malloc(newSize)) {
+            if (originalSize)
+                std::memcpy(newBuffer, originalPtr, originalSize);
+            return newBuffer;
+        }
+        else
+            return NULL;
+    }
+
+    //! Frees a memory block (concept Allocator)
+    static void Free(void *ptr) RAPIDJSON_NOEXCEPT { (void)ptr; } // Do nothing
+
+    //! Compare (equality) with another MemoryPoolAllocator
+    bool operator==(const MemoryPoolAllocator& rhs) const RAPIDJSON_NOEXCEPT {
+        RAPIDJSON_NOEXCEPT_ASSERT(shared_->refcount > 0);
+        RAPIDJSON_NOEXCEPT_ASSERT(rhs.shared_->refcount > 0);
+        return shared_ == rhs.shared_;
+    }
+    //! Compare (inequality) with another MemoryPoolAllocator
+    bool operator!=(const MemoryPoolAllocator& rhs) const RAPIDJSON_NOEXCEPT {
+        return !operator==(rhs);
+    }
+
+private:
+    //! Creates a new chunk.
+    /*! \param capacity Capacity of the chunk in bytes.
+        \return true if success.
+    */
+    bool AddChunk(size_t capacity) {
+        if (!baseAllocator_)
+            shared_->ownBaseAllocator = baseAllocator_ = RAPIDJSON_NEW(BaseAllocator)();
+        if (ChunkHeader* chunk = static_cast<ChunkHeader*>(baseAllocator_->Malloc(SIZEOF_CHUNK_HEADER + capacity))) {
+            chunk->capacity = capacity;
+            chunk->size = 0;
+            chunk->next = shared_->chunkHead;
+            shared_->chunkHead = chunk;
+            return true;
+        }
+        else
+            return false;
+    }
+
+    static inline void* AlignBuffer(void* buf, size_t &size)
+    {
+        RAPIDJSON_NOEXCEPT_ASSERT(buf != 0);
+        const uintptr_t mask = sizeof(void*) - 1;
+        const uintptr_t ubuf = reinterpret_cast<uintptr_t>(buf);
+        if (RAPIDJSON_UNLIKELY(ubuf & mask)) {
+            const uintptr_t abuf = (ubuf + mask) & ~mask;
+            RAPIDJSON_ASSERT(size >= abuf - ubuf);
+            buf = reinterpret_cast<void*>(abuf);
+            size -= abuf - ubuf;
+        }
+        return buf;
+    }
+
+    size_t chunk_capacity_;     //!< The minimum capacity of chunk when they are allocated.
+    BaseAllocator* baseAllocator_;  //!< base allocator for allocating memory chunks.
+    SharedData *shared_;        //!< The shared data of the allocator
+};
+
+namespace internal {
+    template<typename, typename = void>
+    struct IsRefCounted :
+        public FalseType
+    { };
+    template<typename T>
+    struct IsRefCounted<T, typename internal::EnableIfCond<T::kRefCounted>::Type> :
+        public TrueType
+    { };
+}
+
+template<typename T, typename A>
+inline T* Realloc(A& a, T* old_p, size_t old_n, size_t new_n)
+{
+    RAPIDJSON_NOEXCEPT_ASSERT(old_n <= (std::numeric_limits<size_t>::max)() / sizeof(T) && new_n <= (std::numeric_limits<size_t>::max)() / sizeof(T));
+    return static_cast<T*>(a.Realloc(old_p, old_n * sizeof(T), new_n * sizeof(T)));
+}
+
+template<typename T, typename A>
+inline T *Malloc(A& a, size_t n = 1)
+{
+    return Realloc<T, A>(a, NULL, 0, n);
+}
+
+template<typename T, typename A>
+inline void Free(A& a, T *p, size_t n = 1)
+{
+    static_cast<void>(Realloc<T, A>(a, p, n, 0));
+}
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++) // std::allocator can safely be inherited
+#endif
+
+template <typename T, typename BaseAllocator = CrtAllocator>
+class StdAllocator :
+    public std::allocator<T>
+{
+    typedef std::allocator<T> allocator_type;
+#if RAPIDJSON_HAS_CXX11
+    typedef std::allocator_traits<allocator_type> traits_type;
+#else
+    typedef allocator_type traits_type;
+#endif
+
+public:
+    typedef BaseAllocator BaseAllocatorType;
+
+    StdAllocator() RAPIDJSON_NOEXCEPT :
+        allocator_type(),
+        baseAllocator_()
+    { }
+
+    StdAllocator(const StdAllocator& rhs) RAPIDJSON_NOEXCEPT :
+        allocator_type(rhs),
+        baseAllocator_(rhs.baseAllocator_)
+    { }
+
+    template<typename U>
+    StdAllocator(const StdAllocator<U, BaseAllocator>& rhs) RAPIDJSON_NOEXCEPT :
+        allocator_type(rhs),
+        baseAllocator_(rhs.baseAllocator_)
+    { }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    StdAllocator(StdAllocator&& rhs) RAPIDJSON_NOEXCEPT :
+        allocator_type(std::move(rhs)),
+        baseAllocator_(std::move(rhs.baseAllocator_))
+    { }
+#endif
+#if RAPIDJSON_HAS_CXX11
+    using propagate_on_container_move_assignment = std::true_type;
+    using propagate_on_container_swap = std::true_type;
+#endif
+
+    /* implicit */
+    StdAllocator(const BaseAllocator& baseAllocator) RAPIDJSON_NOEXCEPT :
+        allocator_type(),
+        baseAllocator_(baseAllocator)
+    { }
+
+    ~StdAllocator() RAPIDJSON_NOEXCEPT
+    { }
+
+    template<typename U>
+    struct rebind {
+        typedef StdAllocator<U, BaseAllocator> other;
+    };
+
+    typedef typename traits_type::size_type         size_type;
+    typedef typename traits_type::difference_type   difference_type;
+
+    typedef typename traits_type::value_type        value_type;
+    typedef typename traits_type::pointer           pointer;
+    typedef typename traits_type::const_pointer     const_pointer;
+
+#if RAPIDJSON_HAS_CXX11
+
+    typedef typename std::add_lvalue_reference<value_type>::type &reference;
+    typedef typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type &const_reference;
+
+    pointer address(reference r) const RAPIDJSON_NOEXCEPT
+    {
+        return std::addressof(r);
+    }
+    const_pointer address(const_reference r) const RAPIDJSON_NOEXCEPT
+    {
+        return std::addressof(r);
+    }
+
+    size_type max_size() const RAPIDJSON_NOEXCEPT
+    {
+        return traits_type::max_size(*this);
+    }
+
+    template <typename ...Args>
+    void construct(pointer p, Args&&... args)
+    {
+        traits_type::construct(*this, p, std::forward<Args>(args)...);
+    }
+    void destroy(pointer p)
+    {
+        traits_type::destroy(*this, p);
+    }
+
+#else // !RAPIDJSON_HAS_CXX11
+
+    typedef typename allocator_type::reference       reference;
+    typedef typename allocator_type::const_reference const_reference;
+
+    pointer address(reference r) const RAPIDJSON_NOEXCEPT
+    {
+        return allocator_type::address(r);
+    }
+    const_pointer address(const_reference r) const RAPIDJSON_NOEXCEPT
+    {
+        return allocator_type::address(r);
+    }
+
+    size_type max_size() const RAPIDJSON_NOEXCEPT
+    {
+        return allocator_type::max_size();
+    }
+
+    void construct(pointer p, const_reference r)
+    {
+        allocator_type::construct(p, r);
+    }
+    void destroy(pointer p)
+    {
+        allocator_type::destroy(p);
+    }
+
+#endif // !RAPIDJSON_HAS_CXX11
+
+    template <typename U>
+    U* allocate(size_type n = 1, const void* = 0)
+    {
+        return RAPIDJSON_NAMESPACE::Malloc<U>(baseAllocator_, n);
+    }
+    template <typename U>
+    void deallocate(U* p, size_type n = 1)
+    {
+        RAPIDJSON_NAMESPACE::Free<U>(baseAllocator_, p, n);
+    }
+
+    pointer allocate(size_type n = 1, const void* = 0)
+    {
+        return allocate<value_type>(n);
+    }
+    void deallocate(pointer p, size_type n = 1)
+    {
+        deallocate<value_type>(p, n);
+    }
+
+#if RAPIDJSON_HAS_CXX11
+    using is_always_equal = std::is_empty<BaseAllocator>;
+#endif
+
+    template<typename U>
+    bool operator==(const StdAllocator<U, BaseAllocator>& rhs) const RAPIDJSON_NOEXCEPT
+    {
+        return baseAllocator_ == rhs.baseAllocator_;
+    }
+    template<typename U>
+    bool operator!=(const StdAllocator<U, BaseAllocator>& rhs) const RAPIDJSON_NOEXCEPT
+    {
+        return !operator==(rhs);
+    }
+
+    //! rapidjson Allocator concept
+    static const bool kNeedFree = BaseAllocator::kNeedFree;
+    static const bool kRefCounted = internal::IsRefCounted<BaseAllocator>::Value;
+    void* Malloc(size_t size)
+    {
+        return baseAllocator_.Malloc(size);
+    }
+    void* Realloc(void* originalPtr, size_t originalSize, size_t newSize)
+    {
+        return baseAllocator_.Realloc(originalPtr, originalSize, newSize);
+    }
+    static void Free(void *ptr) RAPIDJSON_NOEXCEPT
+    {
+        BaseAllocator::Free(ptr);
+    }
+
+private:
+    template <typename, typename>
+    friend class StdAllocator; // access to StdAllocator<!T>.*
+
+    BaseAllocator baseAllocator_;
+};
+
+#if !RAPIDJSON_HAS_CXX17 // std::allocator<void> deprecated in C++17
+template <typename BaseAllocator>
+class StdAllocator<void, BaseAllocator> :
+    public std::allocator<void>
+{
+    typedef std::allocator<void> allocator_type;
+
+public:
+    typedef BaseAllocator BaseAllocatorType;
+
+    StdAllocator() RAPIDJSON_NOEXCEPT :
+        allocator_type(),
+        baseAllocator_()
+    { }
+
+    StdAllocator(const StdAllocator& rhs) RAPIDJSON_NOEXCEPT :
+        allocator_type(rhs),
+        baseAllocator_(rhs.baseAllocator_)
+    { }
+
+    template<typename U>
+    StdAllocator(const StdAllocator<U, BaseAllocator>& rhs) RAPIDJSON_NOEXCEPT :
+        allocator_type(rhs),
+        baseAllocator_(rhs.baseAllocator_)
+    { }
+
+    /* implicit */
+    StdAllocator(const BaseAllocator& baseAllocator) RAPIDJSON_NOEXCEPT :
+        allocator_type(),
+        baseAllocator_(baseAllocator)
+    { }
+
+    ~StdAllocator() RAPIDJSON_NOEXCEPT
+    { }
+
+    template<typename U>
+    struct rebind {
+        typedef StdAllocator<U, BaseAllocator> other;
+    };
+
+    typedef typename allocator_type::value_type value_type;
+
+private:
+    template <typename, typename>
+    friend class StdAllocator; // access to StdAllocator<!T>.*
+
+    BaseAllocator baseAllocator_;
+};
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_ENCODINGS_H_
diff --git a/include/rapidjson/cursorstreamwrapper.h b/include/rapidjson/cursorstreamwrapper.h
new file mode 100644
index 0000000000..fd6513db14
--- /dev/null
+++ b/include/rapidjson/cursorstreamwrapper.h
@@ -0,0 +1,78 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_CURSORSTREAMWRAPPER_H_
+#define RAPIDJSON_CURSORSTREAMWRAPPER_H_
+
+#include "stream.h"
+
+#if defined(__GNUC__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4702)  // unreachable code
+RAPIDJSON_DIAG_OFF(4512)  // assignment operator could not be generated
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+
+//! Cursor stream wrapper for counting line and column number if error exists.
+/*!
+    \tparam InputStream     Any stream that implements Stream Concept
+*/
+template <typename InputStream, typename Encoding = UTF8<> >
+class CursorStreamWrapper : public GenericStreamWrapper<InputStream, Encoding> {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    CursorStreamWrapper(InputStream& is):
+        GenericStreamWrapper<InputStream, Encoding>(is), line_(1), col_(0) {}
+
+    // counting line and column number
+    Ch Take() {
+        Ch ch = this->is_.Take();
+        if(ch == '\n') {
+            line_ ++;
+            col_ = 0;
+        } else {
+            col_ ++;
+        }
+        return ch;
+    }
+
+    //! Get the error line number, if error exists.
+    size_t GetLine() const { return line_; }
+    //! Get the error column number, if error exists.
+    size_t GetColumn() const { return col_; }
+
+private:
+    size_t line_;   //!< Current Line
+    size_t col_;    //!< Current Column
+};
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+RAPIDJSON_DIAG_POP
+#endif
+
+#if defined(__GNUC__)
+RAPIDJSON_DIAG_POP
+#endif
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_CURSORSTREAMWRAPPER_H_
diff --git a/include/rapidjson/document.h b/include/rapidjson/document.h
new file mode 100644
index 0000000000..4b2d723224
--- /dev/null
+++ b/include/rapidjson/document.h
@@ -0,0 +1,3044 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_DOCUMENT_H_
+#define RAPIDJSON_DOCUMENT_H_
+
+/*! \file document.h */
+
+#include "reader.h"
+#include "internal/meta.h"
+#include "internal/strfunc.h"
+#include "memorystream.h"
+#include "encodedstream.h"
+#include <new>      // placement new
+#include <limits>
+#ifdef __cpp_lib_three_way_comparison
+#include <compare>
+#endif
+
+RAPIDJSON_DIAG_PUSH
+#ifdef __clang__
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(switch-enum)
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant
+RAPIDJSON_DIAG_OFF(4244) // conversion from kXxxFlags to 'uint16_t', possible loss of data
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_OFF(effc++)
+#endif // __GNUC__
+
+#ifdef GetObject
+// see https://github.com/Tencent/rapidjson/issues/1448
+// a former included windows.h might have defined a macro called GetObject, which affects
+// GetObject defined here. This ensures the macro does not get applied
+#pragma push_macro("GetObject")
+#define RAPIDJSON_WINDOWS_GETOBJECT_WORKAROUND_APPLIED
+#undef GetObject
+#endif
+
+#ifndef RAPIDJSON_NOMEMBERITERATORCLASS
+#include <iterator> // std::random_access_iterator_tag
+#endif
+
+#if RAPIDJSON_USE_MEMBERSMAP
+#include <map> // std::multimap
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+// Forward declaration.
+template <typename Encoding, typename Allocator>
+class GenericValue;
+
+template <typename Encoding, typename Allocator, typename StackAllocator>
+class GenericDocument;
+
+/*! \def RAPIDJSON_DEFAULT_ALLOCATOR
+    \ingroup RAPIDJSON_CONFIG
+    \brief Allows to choose default allocator.
+
+    User can define this to use CrtAllocator or MemoryPoolAllocator.
+*/
+#ifndef RAPIDJSON_DEFAULT_ALLOCATOR
+#define RAPIDJSON_DEFAULT_ALLOCATOR ::RAPIDJSON_NAMESPACE::MemoryPoolAllocator<::RAPIDJSON_NAMESPACE::CrtAllocator>
+#endif
+
+/*! \def RAPIDJSON_DEFAULT_STACK_ALLOCATOR
+    \ingroup RAPIDJSON_CONFIG
+    \brief Allows to choose default stack allocator for Document.
+
+    User can define this to use CrtAllocator or MemoryPoolAllocator.
+*/
+#ifndef RAPIDJSON_DEFAULT_STACK_ALLOCATOR
+#define RAPIDJSON_DEFAULT_STACK_ALLOCATOR ::RAPIDJSON_NAMESPACE::CrtAllocator
+#endif
+
+/*! \def RAPIDJSON_VALUE_DEFAULT_OBJECT_CAPACITY
+    \ingroup RAPIDJSON_CONFIG
+    \brief User defined kDefaultObjectCapacity value.
+
+    User can define this as any natural number.
+*/
+#ifndef RAPIDJSON_VALUE_DEFAULT_OBJECT_CAPACITY
+// number of objects that rapidjson::Value allocates memory for by default
+#define RAPIDJSON_VALUE_DEFAULT_OBJECT_CAPACITY 16
+#endif
+
+/*! \def RAPIDJSON_VALUE_DEFAULT_ARRAY_CAPACITY
+    \ingroup RAPIDJSON_CONFIG
+    \brief User defined kDefaultArrayCapacity value.
+
+    User can define this as any natural number.
+*/
+#ifndef RAPIDJSON_VALUE_DEFAULT_ARRAY_CAPACITY
+// number of array elements that rapidjson::Value allocates memory for by default
+#define RAPIDJSON_VALUE_DEFAULT_ARRAY_CAPACITY 16
+#endif
+
+//! Name-value pair in a JSON object value.
+/*!
+    This class was internal to GenericValue. It used to be a inner struct.
+    But a compiler (IBM XL C/C++ for AIX) have reported to have problem with that so it moved as a namespace scope struct.
+    https://code.google.com/p/rapidjson/issues/detail?id=64
+*/
+template <typename Encoding, typename Allocator> 
+class GenericMember {
+public:
+    GenericValue<Encoding, Allocator> name;     //!< name of member (must be a string)
+    GenericValue<Encoding, Allocator> value;    //!< value of member.
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move constructor in C++11
+    GenericMember(GenericMember&& rhs) RAPIDJSON_NOEXCEPT
+        : name(std::move(rhs.name)),
+          value(std::move(rhs.value))
+    {
+    }
+
+    //! Move assignment in C++11
+    GenericMember& operator=(GenericMember&& rhs) RAPIDJSON_NOEXCEPT {
+        return *this = static_cast<GenericMember&>(rhs);
+    }
+#endif
+
+    //! Assignment with move semantics.
+    /*! \param rhs Source of the assignment. Its name and value will become a null value after assignment.
+    */
+    GenericMember& operator=(GenericMember& rhs) RAPIDJSON_NOEXCEPT {
+        if (RAPIDJSON_LIKELY(this != &rhs)) {
+            name = rhs.name;
+            value = rhs.value;
+        }
+        return *this;
+    }
+
+    // swap() for std::sort() and other potential use in STL.
+    friend inline void swap(GenericMember& a, GenericMember& b) RAPIDJSON_NOEXCEPT {
+        a.name.Swap(b.name);
+        a.value.Swap(b.value);
+    }
+
+private:
+    //! Copy constructor is not permitted.
+    GenericMember(const GenericMember& rhs);
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericMemberIterator
+
+#ifndef RAPIDJSON_NOMEMBERITERATORCLASS
+
+//! (Constant) member iterator for a JSON object value
+/*!
+    \tparam Const Is this a constant iterator?
+    \tparam Encoding    Encoding of the value. (Even non-string values need to have the same encoding in a document)
+    \tparam Allocator   Allocator type for allocating memory of object, array and string.
+
+    This class implements a Random Access Iterator for GenericMember elements
+    of a GenericValue, see ISO/IEC 14882:2003(E) C++ standard, 24.1 [lib.iterator.requirements].
+
+    \note This iterator implementation is mainly intended to avoid implicit
+        conversions from iterator values to \c NULL,
+        e.g. from GenericValue::FindMember.
+
+    \note Define \c RAPIDJSON_NOMEMBERITERATORCLASS to fall back to a
+        pointer-based implementation, if your platform doesn't provide
+        the C++ <iterator> header.
+
+    \see GenericMember, GenericValue::MemberIterator, GenericValue::ConstMemberIterator
+ */
+template <bool Const, typename Encoding, typename Allocator>
+class GenericMemberIterator {
+
+    friend class GenericValue<Encoding,Allocator>;
+    template <bool, typename, typename> friend class GenericMemberIterator;
+
+    typedef GenericMember<Encoding,Allocator> PlainType;
+    typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType;
+
+public:
+    //! Iterator type itself
+    typedef GenericMemberIterator Iterator;
+    //! Constant iterator type
+    typedef GenericMemberIterator<true,Encoding,Allocator>  ConstIterator;
+    //! Non-constant iterator type
+    typedef GenericMemberIterator<false,Encoding,Allocator> NonConstIterator;
+
+    /** \name std::iterator_traits support */
+    //@{
+    typedef ValueType      value_type;
+    typedef ValueType *    pointer;
+    typedef ValueType &    reference;
+    typedef std::ptrdiff_t difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+    //@}
+
+    //! Pointer to (const) GenericMember
+    typedef pointer         Pointer;
+    //! Reference to (const) GenericMember
+    typedef reference       Reference;
+    //! Signed integer type (e.g. \c ptrdiff_t)
+    typedef difference_type DifferenceType;
+
+    //! Default constructor (singular value)
+    /*! Creates an iterator pointing to no element.
+        \note All operations, except for comparisons, are undefined on such values.
+     */
+    GenericMemberIterator() : ptr_() {}
+
+    //! Iterator conversions to more const
+    /*!
+        \param it (Non-const) iterator to copy from
+
+        Allows the creation of an iterator from another GenericMemberIterator
+        that is "less const".  Especially, creating a non-constant iterator
+        from a constant iterator are disabled:
+        \li const -> non-const (not ok)
+        \li const -> const (ok)
+        \li non-const -> const (ok)
+        \li non-const -> non-const (ok)
+
+        \note If the \c Const template parameter is already \c false, this
+            constructor effectively defines a regular copy-constructor.
+            Otherwise, the copy constructor is implicitly defined.
+    */
+    GenericMemberIterator(const NonConstIterator & it) : ptr_(it.ptr_) {}
+    Iterator& operator=(const NonConstIterator & it) { ptr_ = it.ptr_; return *this; }
+
+    //! @name stepping
+    //@{
+    Iterator& operator++(){ ++ptr_; return *this; }
+    Iterator& operator--(){ --ptr_; return *this; }
+    Iterator  operator++(int){ Iterator old(*this); ++ptr_; return old; }
+    Iterator  operator--(int){ Iterator old(*this); --ptr_; return old; }
+    //@}
+
+    //! @name increment/decrement
+    //@{
+    Iterator operator+(DifferenceType n) const { return Iterator(ptr_+n); }
+    Iterator operator-(DifferenceType n) const { return Iterator(ptr_-n); }
+
+    Iterator& operator+=(DifferenceType n) { ptr_+=n; return *this; }
+    Iterator& operator-=(DifferenceType n) { ptr_-=n; return *this; }
+    //@}
+
+    //! @name relations
+    //@{
+    template <bool Const_> bool operator==(const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ == that.ptr_; }
+    template <bool Const_> bool operator!=(const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ != that.ptr_; }
+    template <bool Const_> bool operator<=(const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ <= that.ptr_; }
+    template <bool Const_> bool operator>=(const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ >= that.ptr_; }
+    template <bool Const_> bool operator< (const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ < that.ptr_; }
+    template <bool Const_> bool operator> (const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ > that.ptr_; }
+
+#ifdef __cpp_lib_three_way_comparison
+    template <bool Const_> std::strong_ordering operator<=>(const GenericMemberIterator<Const_, Encoding, Allocator>& that) const { return ptr_ <=> that.ptr_; }
+#endif
+    //@}
+
+    //! @name dereference
+    //@{
+    Reference operator*() const { return *ptr_; }
+    Pointer   operator->() const { return ptr_; }
+    Reference operator[](DifferenceType n) const { return ptr_[n]; }
+    //@}
+
+    //! Distance
+    DifferenceType operator-(ConstIterator that) const { return ptr_-that.ptr_; }
+
+private:
+    //! Internal constructor from plain pointer
+    explicit GenericMemberIterator(Pointer p) : ptr_(p) {}
+
+    Pointer ptr_; //!< raw pointer
+};
+
+#else // RAPIDJSON_NOMEMBERITERATORCLASS
+
+// class-based member iterator implementation disabled, use plain pointers
+
+template <bool Const, typename Encoding, typename Allocator>
+class GenericMemberIterator;
+
+//! non-const GenericMemberIterator
+template <typename Encoding, typename Allocator>
+class GenericMemberIterator<false,Encoding,Allocator> {
+public:
+    //! use plain pointer as iterator type
+    typedef GenericMember<Encoding,Allocator>* Iterator;
+};
+//! const GenericMemberIterator
+template <typename Encoding, typename Allocator>
+class GenericMemberIterator<true,Encoding,Allocator> {
+public:
+    //! use plain const pointer as iterator type
+    typedef const GenericMember<Encoding,Allocator>* Iterator;
+};
+
+#endif // RAPIDJSON_NOMEMBERITERATORCLASS
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericStringRef
+
+//! Reference to a constant string (not taking a copy)
+/*!
+    \tparam CharType character type of the string
+
+    This helper class is used to automatically infer constant string
+    references for string literals, especially from \c const \b (!)
+    character arrays.
+
+    The main use is for creating JSON string values without copying the
+    source string via an \ref Allocator.  This requires that the referenced
+    string pointers have a sufficient lifetime, which exceeds the lifetime
+    of the associated GenericValue.
+
+    \b Example
+    \code
+    Value v("foo");   // ok, no need to copy & calculate length
+    const char foo[] = "foo";
+    v.SetString(foo); // ok
+
+    const char* bar = foo;
+    // Value x(bar); // not ok, can't rely on bar's lifetime
+    Value x(StringRef(bar)); // lifetime explicitly guaranteed by user
+    Value y(StringRef(bar, 3));  // ok, explicitly pass length
+    \endcode
+
+    \see StringRef, GenericValue::SetString
+*/
+template<typename CharType>
+struct GenericStringRef {
+    typedef CharType Ch; //!< character type of the string
+
+    //! Create string reference from \c const character array
+#ifndef __clang__ // -Wdocumentation
+    /*!
+        This constructor implicitly creates a constant string reference from
+        a \c const character array.  It has better performance than
+        \ref StringRef(const CharType*) by inferring the string \ref length
+        from the array length, and also supports strings containing null
+        characters.
+
+        \tparam N length of the string, automatically inferred
+
+        \param str Constant character array, lifetime assumed to be longer
+            than the use of the string in e.g. a GenericValue
+
+        \post \ref s == str
+
+        \note Constant complexity.
+        \note There is a hidden, private overload to disallow references to
+            non-const character arrays to be created via this constructor.
+            By this, e.g. function-scope arrays used to be filled via
+            \c snprintf are excluded from consideration.
+            In such cases, the referenced string should be \b copied to the
+            GenericValue instead.
+     */
+#endif
+    template<SizeType N>
+    GenericStringRef(const CharType (&str)[N]) RAPIDJSON_NOEXCEPT
+        : s(str), length(N-1) {}
+
+    //! Explicitly create string reference from \c const character pointer
+#ifndef __clang__ // -Wdocumentation
+    /*!
+        This constructor can be used to \b explicitly  create a reference to
+        a constant string pointer.
+
+        \see StringRef(const CharType*)
+
+        \param str Constant character pointer, lifetime assumed to be longer
+            than the use of the string in e.g. a GenericValue
+
+        \post \ref s == str
+
+        \note There is a hidden, private overload to disallow references to
+            non-const character arrays to be created via this constructor.
+            By this, e.g. function-scope arrays used to be filled via
+            \c snprintf are excluded from consideration.
+            In such cases, the referenced string should be \b copied to the
+            GenericValue instead.
+     */
+#endif
+    explicit GenericStringRef(const CharType* str)
+        : s(str), length(NotNullStrLen(str)) {}
+
+    //! Create constant string reference from pointer and length
+#ifndef __clang__ // -Wdocumentation
+    /*! \param str constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
+        \param len length of the string, excluding the trailing NULL terminator
+
+        \post \ref s == str && \ref length == len
+        \note Constant complexity.
+     */
+#endif
+    GenericStringRef(const CharType* str, SizeType len)
+        : s(RAPIDJSON_LIKELY(str) ? str : emptyString), length(len) { RAPIDJSON_ASSERT(str != 0 || len == 0u); }
+
+    GenericStringRef(const GenericStringRef& rhs) : s(rhs.s), length(rhs.length) {}
+
+    //! implicit conversion to plain CharType pointer
+    operator const Ch *() const { return s; }
+
+    const Ch* const s; //!< plain CharType pointer
+    const SizeType length; //!< length of the string (excluding the trailing NULL terminator)
+
+private:
+    SizeType NotNullStrLen(const CharType* str) {
+        RAPIDJSON_ASSERT(str != 0);
+        return internal::StrLen(str);
+    }
+
+    /// Empty string - used when passing in a NULL pointer
+    static const Ch emptyString[];
+
+    //! Disallow construction from non-const array
+    template<SizeType N>
+    GenericStringRef(CharType (&str)[N]) /* = delete */;
+    //! Copy assignment operator not permitted - immutable type
+    GenericStringRef& operator=(const GenericStringRef& rhs) /* = delete */;
+};
+
+template<typename CharType>
+const CharType GenericStringRef<CharType>::emptyString[] = { CharType() };
+
+//! Mark a character pointer as constant string
+/*! Mark a plain character pointer as a "string literal".  This function
+    can be used to avoid copying a character string to be referenced as a
+    value in a JSON GenericValue object, if the string's lifetime is known
+    to be valid long enough.
+    \tparam CharType Character type of the string
+    \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
+    \return GenericStringRef string reference object
+    \relatesalso GenericStringRef
+
+    \see GenericValue::GenericValue(StringRefType), GenericValue::operator=(StringRefType), GenericValue::SetString(StringRefType), GenericValue::PushBack(StringRefType, Allocator&), GenericValue::AddMember
+*/
+template<typename CharType>
+inline GenericStringRef<CharType> StringRef(const CharType* str) {
+    return GenericStringRef<CharType>(str);
+}
+
+//! Mark a character pointer as constant string
+/*! Mark a plain character pointer as a "string literal".  This function
+    can be used to avoid copying a character string to be referenced as a
+    value in a JSON GenericValue object, if the string's lifetime is known
+    to be valid long enough.
+
+    This version has better performance with supplied length, and also
+    supports string containing null characters.
+
+    \tparam CharType character type of the string
+    \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
+    \param length The length of source string.
+    \return GenericStringRef string reference object
+    \relatesalso GenericStringRef
+*/
+template<typename CharType>
+inline GenericStringRef<CharType> StringRef(const CharType* str, size_t length) {
+    return GenericStringRef<CharType>(str, SizeType(length));
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+//! Mark a string object as constant string
+/*! Mark a string object (e.g. \c std::string) as a "string literal".
+    This function can be used to avoid copying a string to be referenced as a
+    value in a JSON GenericValue object, if the string's lifetime is known
+    to be valid long enough.
+
+    \tparam CharType character type of the string
+    \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
+    \return GenericStringRef string reference object
+    \relatesalso GenericStringRef
+    \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+*/
+template<typename CharType>
+inline GenericStringRef<CharType> StringRef(const std::basic_string<CharType>& str) {
+    return GenericStringRef<CharType>(str.data(), SizeType(str.size()));
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericValue type traits
+namespace internal {
+
+template <typename T, typename Encoding = void, typename Allocator = void>
+struct IsGenericValueImpl : FalseType {};
+
+// select candidates according to nested encoding and allocator types
+template <typename T> struct IsGenericValueImpl<T, typename Void<typename T::EncodingType>::Type, typename Void<typename T::AllocatorType>::Type>
+    : IsBaseOf<GenericValue<typename T::EncodingType, typename T::AllocatorType>, T>::Type {};
+
+// helper to match arbitrary GenericValue instantiations, including derived classes
+template <typename T> struct IsGenericValue : IsGenericValueImpl<T>::Type {};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// TypeHelper
+
+namespace internal {
+
+template <typename ValueType, typename T>
+struct TypeHelper {};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, bool> {
+    static bool Is(const ValueType& v) { return v.IsBool(); }
+    static bool Get(const ValueType& v) { return v.GetBool(); }
+    static ValueType& Set(ValueType& v, bool data) { return v.SetBool(data); }
+    static ValueType& Set(ValueType& v, bool data, typename ValueType::AllocatorType&) { return v.SetBool(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, int> {
+    static bool Is(const ValueType& v) { return v.IsInt(); }
+    static int Get(const ValueType& v) { return v.GetInt(); }
+    static ValueType& Set(ValueType& v, int data) { return v.SetInt(data); }
+    static ValueType& Set(ValueType& v, int data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, unsigned> {
+    static bool Is(const ValueType& v) { return v.IsUint(); }
+    static unsigned Get(const ValueType& v) { return v.GetUint(); }
+    static ValueType& Set(ValueType& v, unsigned data) { return v.SetUint(data); }
+    static ValueType& Set(ValueType& v, unsigned data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
+};
+
+#ifdef _MSC_VER
+RAPIDJSON_STATIC_ASSERT(sizeof(long) == sizeof(int));
+template<typename ValueType>
+struct TypeHelper<ValueType, long> {
+    static bool Is(const ValueType& v) { return v.IsInt(); }
+    static long Get(const ValueType& v) { return v.GetInt(); }
+    static ValueType& Set(ValueType& v, long data) { return v.SetInt(data); }
+    static ValueType& Set(ValueType& v, long data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
+};
+
+RAPIDJSON_STATIC_ASSERT(sizeof(unsigned long) == sizeof(unsigned));
+template<typename ValueType>
+struct TypeHelper<ValueType, unsigned long> {
+    static bool Is(const ValueType& v) { return v.IsUint(); }
+    static unsigned long Get(const ValueType& v) { return v.GetUint(); }
+    static ValueType& Set(ValueType& v, unsigned long data) { return v.SetUint(data); }
+    static ValueType& Set(ValueType& v, unsigned long data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
+};
+#endif
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, int64_t> {
+    static bool Is(const ValueType& v) { return v.IsInt64(); }
+    static int64_t Get(const ValueType& v) { return v.GetInt64(); }
+    static ValueType& Set(ValueType& v, int64_t data) { return v.SetInt64(data); }
+    static ValueType& Set(ValueType& v, int64_t data, typename ValueType::AllocatorType&) { return v.SetInt64(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, uint64_t> {
+    static bool Is(const ValueType& v) { return v.IsUint64(); }
+    static uint64_t Get(const ValueType& v) { return v.GetUint64(); }
+    static ValueType& Set(ValueType& v, uint64_t data) { return v.SetUint64(data); }
+    static ValueType& Set(ValueType& v, uint64_t data, typename ValueType::AllocatorType&) { return v.SetUint64(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, double> {
+    static bool Is(const ValueType& v) { return v.IsDouble(); }
+    static double Get(const ValueType& v) { return v.GetDouble(); }
+    static ValueType& Set(ValueType& v, double data) { return v.SetDouble(data); }
+    static ValueType& Set(ValueType& v, double data, typename ValueType::AllocatorType&) { return v.SetDouble(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, float> {
+    static bool Is(const ValueType& v) { return v.IsFloat(); }
+    static float Get(const ValueType& v) { return v.GetFloat(); }
+    static ValueType& Set(ValueType& v, float data) { return v.SetFloat(data); }
+    static ValueType& Set(ValueType& v, float data, typename ValueType::AllocatorType&) { return v.SetFloat(data); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, const typename ValueType::Ch*> {
+    typedef const typename ValueType::Ch* StringType;
+    static bool Is(const ValueType& v) { return v.IsString(); }
+    static StringType Get(const ValueType& v) { return v.GetString(); }
+    static ValueType& Set(ValueType& v, const StringType data) { return v.SetString(typename ValueType::StringRefType(data)); }
+    static ValueType& Set(ValueType& v, const StringType data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
+};
+
+#if RAPIDJSON_HAS_STDSTRING
+template<typename ValueType> 
+struct TypeHelper<ValueType, std::basic_string<typename ValueType::Ch> > {
+    typedef std::basic_string<typename ValueType::Ch> StringType;
+    static bool Is(const ValueType& v) { return v.IsString(); }
+    static StringType Get(const ValueType& v) { return StringType(v.GetString(), v.GetStringLength()); }
+    static ValueType& Set(ValueType& v, const StringType& data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
+};
+#endif
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, typename ValueType::Array> {
+    typedef typename ValueType::Array ArrayType;
+    static bool Is(const ValueType& v) { return v.IsArray(); }
+    static ArrayType Get(ValueType& v) { return v.GetArray(); }
+    static ValueType& Set(ValueType& v, ArrayType data) { return v = data; }
+    static ValueType& Set(ValueType& v, ArrayType data, typename ValueType::AllocatorType&) { return v = data; }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, typename ValueType::ConstArray> {
+    typedef typename ValueType::ConstArray ArrayType;
+    static bool Is(const ValueType& v) { return v.IsArray(); }
+    static ArrayType Get(const ValueType& v) { return v.GetArray(); }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, typename ValueType::Object> {
+    typedef typename ValueType::Object ObjectType;
+    static bool Is(const ValueType& v) { return v.IsObject(); }
+    static ObjectType Get(ValueType& v) { return v.GetObject(); }
+    static ValueType& Set(ValueType& v, ObjectType data) { return v = data; }
+    static ValueType& Set(ValueType& v, ObjectType data, typename ValueType::AllocatorType&) { return v = data; }
+};
+
+template<typename ValueType> 
+struct TypeHelper<ValueType, typename ValueType::ConstObject> {
+    typedef typename ValueType::ConstObject ObjectType;
+    static bool Is(const ValueType& v) { return v.IsObject(); }
+    static ObjectType Get(const ValueType& v) { return v.GetObject(); }
+};
+
+} // namespace internal
+
+// Forward declarations
+template <bool, typename> class GenericArray;
+template <bool, typename> class GenericObject;
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericValue
+
+//! Represents a JSON value. Use Value for UTF8 encoding and default allocator.
+/*!
+    A JSON value can be one of 7 types. This class is a variant type supporting
+    these types.
+
+    Use the Value if UTF8 and default allocator
+
+    \tparam Encoding    Encoding of the value. (Even non-string values need to have the same encoding in a document)
+    \tparam Allocator   Allocator type for allocating memory of object, array and string.
+*/
+template <typename Encoding, typename Allocator = RAPIDJSON_DEFAULT_ALLOCATOR >
+class GenericValue {
+public:
+    //! Name-value pair in an object.
+    typedef GenericMember<Encoding, Allocator> Member;
+    typedef Encoding EncodingType;                  //!< Encoding type from template parameter.
+    typedef Allocator AllocatorType;                //!< Allocator type from template parameter.
+    typedef typename Encoding::Ch Ch;               //!< Character type derived from Encoding.
+    typedef GenericStringRef<Ch> StringRefType;     //!< Reference to a constant string
+    typedef typename GenericMemberIterator<false,Encoding,Allocator>::Iterator MemberIterator;  //!< Member iterator for iterating in object.
+    typedef typename GenericMemberIterator<true,Encoding,Allocator>::Iterator ConstMemberIterator;  //!< Constant member iterator for iterating in object.
+    typedef GenericValue* ValueIterator;            //!< Value iterator for iterating in array.
+    typedef const GenericValue* ConstValueIterator; //!< Constant value iterator for iterating in array.
+    typedef GenericValue<Encoding, Allocator> ValueType;    //!< Value type of itself.
+    typedef GenericArray<false, ValueType> Array;
+    typedef GenericArray<true, ValueType> ConstArray;
+    typedef GenericObject<false, ValueType> Object;
+    typedef GenericObject<true, ValueType> ConstObject;
+
+    //!@name Constructors and destructor.
+    //@{
+
+    //! Default constructor creates a null value.
+    GenericValue() RAPIDJSON_NOEXCEPT : data_() { data_.f.flags = kNullFlag; }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move constructor in C++11
+    GenericValue(GenericValue&& rhs) RAPIDJSON_NOEXCEPT : data_(rhs.data_) {
+        rhs.data_.f.flags = kNullFlag; // give up contents
+    }
+#endif
+
+private:
+    //! Copy constructor is not permitted.
+    GenericValue(const GenericValue& rhs);
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Moving from a GenericDocument is not permitted.
+    template <typename StackAllocator>
+    GenericValue(GenericDocument<Encoding,Allocator,StackAllocator>&& rhs);
+
+    //! Move assignment from a GenericDocument is not permitted.
+    template <typename StackAllocator>
+    GenericValue& operator=(GenericDocument<Encoding,Allocator,StackAllocator>&& rhs);
+#endif
+
+public:
+
+    //! Constructor with JSON value type.
+    /*! This creates a Value of specified type with default content.
+        \param type Type of the value.
+        \note Default content for number is zero.
+    */
+    explicit GenericValue(Type type) RAPIDJSON_NOEXCEPT : data_() {
+        static const uint16_t defaultFlags[] = {
+            kNullFlag, kFalseFlag, kTrueFlag, kObjectFlag, kArrayFlag, kShortStringFlag,
+            kNumberAnyFlag
+        };
+        RAPIDJSON_NOEXCEPT_ASSERT(type >= kNullType && type <= kNumberType);
+        data_.f.flags = defaultFlags[type];
+
+        // Use ShortString to store empty string.
+        if (type == kStringType)
+            data_.ss.SetLength(0);
+    }
+
+    //! Explicit copy constructor (with allocator)
+    /*! Creates a copy of a Value by using the given Allocator
+        \tparam SourceAllocator allocator of \c rhs
+        \param rhs Value to copy from (read-only)
+        \param allocator Allocator for allocating copied elements and buffers. Commonly use GenericDocument::GetAllocator().
+        \param copyConstStrings Force copying of constant strings (e.g. referencing an in-situ buffer)
+        \see CopyFrom()
+    */
+    template <typename SourceAllocator>
+    GenericValue(const GenericValue<Encoding,SourceAllocator>& rhs, Allocator& allocator, bool copyConstStrings = false) {
+        switch (rhs.GetType()) {
+        case kObjectType:
+            DoCopyMembers(rhs, allocator, copyConstStrings);
+            break;
+        case kArrayType: {
+                SizeType count = rhs.data_.a.size;
+                GenericValue* le = reinterpret_cast<GenericValue*>(allocator.Malloc(count * sizeof(GenericValue)));
+                const GenericValue<Encoding,SourceAllocator>* re = rhs.GetElementsPointer();
+                for (SizeType i = 0; i < count; i++)
+                    new (&le[i]) GenericValue(re[i], allocator, copyConstStrings);
+                data_.f.flags = kArrayFlag;
+                data_.a.size = data_.a.capacity = count;
+                SetElementsPointer(le);
+            }
+            break;
+        case kStringType:
+            if (rhs.data_.f.flags == kConstStringFlag && !copyConstStrings) {
+                data_.f.flags = rhs.data_.f.flags;
+                data_  = *reinterpret_cast<const Data*>(&rhs.data_);
+            }
+            else
+                SetStringRaw(StringRef(rhs.GetString(), rhs.GetStringLength()), allocator);
+            break;
+        default:
+            data_.f.flags = rhs.data_.f.flags;
+            data_  = *reinterpret_cast<const Data*>(&rhs.data_);
+            break;
+        }
+    }
+
+    //! Constructor for boolean value.
+    /*! \param b Boolean value
+        \note This constructor is limited to \em real boolean values and rejects
+            implicitly converted types like arbitrary pointers.  Use an explicit cast
+            to \c bool, if you want to construct a boolean JSON value in such cases.
+     */
+#ifndef RAPIDJSON_DOXYGEN_RUNNING // hide SFINAE from Doxygen
+    template <typename T>
+    explicit GenericValue(T b, RAPIDJSON_ENABLEIF((internal::IsSame<bool, T>))) RAPIDJSON_NOEXCEPT  // See #472
+#else
+    explicit GenericValue(bool b) RAPIDJSON_NOEXCEPT
+#endif
+        : data_() {
+            // safe-guard against failing SFINAE
+            RAPIDJSON_STATIC_ASSERT((internal::IsSame<bool,T>::Value));
+            data_.f.flags = b ? kTrueFlag : kFalseFlag;
+    }
+
+    //! Constructor for int value.
+    explicit GenericValue(int i) RAPIDJSON_NOEXCEPT : data_() {
+        data_.n.i64 = i;
+        data_.f.flags = (i >= 0) ? (kNumberIntFlag | kUintFlag | kUint64Flag) : kNumberIntFlag;
+    }
+
+    //! Constructor for unsigned value.
+    explicit GenericValue(unsigned u) RAPIDJSON_NOEXCEPT : data_() {
+        data_.n.u64 = u; 
+        data_.f.flags = (u & 0x80000000) ? kNumberUintFlag : (kNumberUintFlag | kIntFlag | kInt64Flag);
+    }
+
+    //! Constructor for int64_t value.
+    explicit GenericValue(int64_t i64) RAPIDJSON_NOEXCEPT : data_() {
+        data_.n.i64 = i64;
+        data_.f.flags = kNumberInt64Flag;
+        if (i64 >= 0) {
+            data_.f.flags |= kNumberUint64Flag;
+            if (!(static_cast<uint64_t>(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000)))
+                data_.f.flags |= kUintFlag;
+            if (!(static_cast<uint64_t>(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000)))
+                data_.f.flags |= kIntFlag;
+        }
+        else if (i64 >= static_cast<int64_t>(RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000)))
+            data_.f.flags |= kIntFlag;
+    }
+
+    //! Constructor for uint64_t value.
+    explicit GenericValue(uint64_t u64) RAPIDJSON_NOEXCEPT : data_() {
+        data_.n.u64 = u64;
+        data_.f.flags = kNumberUint64Flag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0x80000000, 0x00000000)))
+            data_.f.flags |= kInt64Flag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000)))
+            data_.f.flags |= kUintFlag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000)))
+            data_.f.flags |= kIntFlag;
+    }
+
+    //! Constructor for double value.
+    explicit GenericValue(double d) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = d; data_.f.flags = kNumberDoubleFlag; }
+
+    //! Constructor for float value.
+    explicit GenericValue(float f) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = static_cast<double>(f); data_.f.flags = kNumberDoubleFlag; }
+
+    //! Constructor for constant string (i.e. do not make a copy of string)
+    GenericValue(const Ch* s, SizeType length) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(StringRef(s, length)); }
+
+    //! Constructor for constant string (i.e. do not make a copy of string)
+    explicit GenericValue(StringRefType s) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(s); }
+
+    //! Constructor for copy-string (i.e. do make a copy of string)
+    GenericValue(const Ch* s, SizeType length, Allocator& allocator) : data_() { SetStringRaw(StringRef(s, length), allocator); }
+
+    //! Constructor for copy-string (i.e. do make a copy of string)
+    GenericValue(const Ch*s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Constructor for copy-string from a string object (i.e. do make a copy of string)
+    /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+     */
+    GenericValue(const std::basic_string<Ch>& s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); }
+#endif
+
+    //! Constructor for Array.
+    /*!
+        \param a An array obtained by \c GetArray().
+        \note \c Array is always pass-by-value.
+        \note the source array is moved into this value and the sourec array becomes empty.
+    */
+    GenericValue(Array a) RAPIDJSON_NOEXCEPT : data_(a.value_.data_) {
+        a.value_.data_ = Data();
+        a.value_.data_.f.flags = kArrayFlag;
+    }
+
+    //! Constructor for Object.
+    /*!
+        \param o An object obtained by \c GetObject().
+        \note \c Object is always pass-by-value.
+        \note the source object is moved into this value and the sourec object becomes empty.
+    */
+    GenericValue(Object o) RAPIDJSON_NOEXCEPT : data_(o.value_.data_) {
+        o.value_.data_ = Data();
+        o.value_.data_.f.flags = kObjectFlag;
+    }
+
+    //! Destructor.
+    /*! Need to destruct elements of array, members of object, or copy-string.
+    */
+    ~GenericValue() {
+        // With RAPIDJSON_USE_MEMBERSMAP, the maps need to be destroyed to release
+        // their Allocator if it's refcounted (e.g. MemoryPoolAllocator).
+        if (Allocator::kNeedFree || (RAPIDJSON_USE_MEMBERSMAP+0 &&
+                                     internal::IsRefCounted<Allocator>::Value)) {
+            switch(data_.f.flags) {
+            case kArrayFlag:
+                {
+                    GenericValue* e = GetElementsPointer();
+                    for (GenericValue* v = e; v != e + data_.a.size; ++v)
+                        v->~GenericValue();
+                    if (Allocator::kNeedFree) { // Shortcut by Allocator's trait
+                        Allocator::Free(e);
+                    }
+                }
+                break;
+
+            case kObjectFlag:
+                DoFreeMembers();
+                break;
+
+            case kCopyStringFlag:
+                if (Allocator::kNeedFree) { // Shortcut by Allocator's trait
+                    Allocator::Free(const_cast<Ch*>(GetStringPointer()));
+                }
+                break;
+
+            default:
+                break;  // Do nothing for other types.
+            }
+        }
+    }
+
+    //@}
+
+    //!@name Assignment operators
+    //@{
+
+    //! Assignment with move semantics.
+    /*! \param rhs Source of the assignment. It will become a null value after assignment.
+    */
+    GenericValue& operator=(GenericValue& rhs) RAPIDJSON_NOEXCEPT {
+        if (RAPIDJSON_LIKELY(this != &rhs)) {
+            // Can't destroy "this" before assigning "rhs", otherwise "rhs"
+            // could be used after free if it's an sub-Value of "this",
+            // hence the temporary danse.
+            GenericValue temp;
+            temp.RawAssign(rhs);
+            this->~GenericValue();
+            RawAssign(temp);
+        }
+        return *this;
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move assignment in C++11
+    GenericValue& operator=(GenericValue&& rhs) RAPIDJSON_NOEXCEPT {
+        return *this = rhs.Move();
+    }
+#endif
+
+    //! Assignment of constant string reference (no copy)
+    /*! \param str Constant string reference to be assigned
+        \note This overload is needed to avoid clashes with the generic primitive type assignment overload below.
+        \see GenericStringRef, operator=(T)
+    */
+    GenericValue& operator=(StringRefType str) RAPIDJSON_NOEXCEPT {
+        GenericValue s(str);
+        return *this = s;
+    }
+
+    //! Assignment with primitive types.
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t
+        \param value The value to be assigned.
+
+        \note The source type \c T explicitly disallows all pointer types,
+            especially (\c const) \ref Ch*.  This helps avoiding implicitly
+            referencing character strings with insufficient lifetime, use
+            \ref SetString(const Ch*, Allocator&) (for copying) or
+            \ref StringRef() (to explicitly mark the pointer as constant) instead.
+            All other pointer types would implicitly convert to \c bool,
+            use \ref SetBool() instead.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::IsPointer<T>), (GenericValue&))
+    operator=(T value) {
+        GenericValue v(value);
+        return *this = v;
+    }
+
+    //! Deep-copy assignment from Value
+    /*! Assigns a \b copy of the Value to the current Value object
+        \tparam SourceAllocator Allocator type of \c rhs
+        \param rhs Value to copy from (read-only)
+        \param allocator Allocator to use for copying
+        \param copyConstStrings Force copying of constant strings (e.g. referencing an in-situ buffer)
+     */
+    template <typename SourceAllocator>
+    GenericValue& CopyFrom(const GenericValue<Encoding, SourceAllocator>& rhs, Allocator& allocator, bool copyConstStrings = false) {
+        RAPIDJSON_ASSERT(static_cast<void*>(this) != static_cast<void const*>(&rhs));
+        this->~GenericValue();
+        new (this) GenericValue(rhs, allocator, copyConstStrings);
+        return *this;
+    }
+
+    //! Exchange the contents of this value with those of other.
+    /*!
+        \param other Another value.
+        \note Constant complexity.
+    */
+    GenericValue& Swap(GenericValue& other) RAPIDJSON_NOEXCEPT {
+        GenericValue temp;
+        temp.RawAssign(*this);
+        RawAssign(other);
+        other.RawAssign(temp);
+        return *this;
+    }
+
+    //! free-standing swap function helper
+    /*!
+        Helper function to enable support for common swap implementation pattern based on \c std::swap:
+        \code
+        void swap(MyClass& a, MyClass& b) {
+            using std::swap;
+            swap(a.value, b.value);
+            // ...
+        }
+        \endcode
+        \see Swap()
+     */
+    friend inline void swap(GenericValue& a, GenericValue& b) RAPIDJSON_NOEXCEPT { a.Swap(b); }
+
+    //! Prepare Value for move semantics
+    /*! \return *this */
+    GenericValue& Move() RAPIDJSON_NOEXCEPT { return *this; }
+    //@}
+
+    //!@name Equal-to and not-equal-to operators
+    //@{
+    //! Equal-to operator
+    /*!
+        \note If an object contains duplicated named member, comparing equality with any object is always \c false.
+        \note Complexity is quadratic in Object's member number and linear for the rest (number of all values in the subtree and total lengths of all strings).
+    */
+    template <typename SourceAllocator>
+    bool operator==(const GenericValue<Encoding, SourceAllocator>& rhs) const {
+        typedef GenericValue<Encoding, SourceAllocator> RhsType;
+        if (GetType() != rhs.GetType())
+            return false;
+
+        switch (GetType()) {
+        case kObjectType: // Warning: O(n^2) inner-loop
+            if (data_.o.size != rhs.data_.o.size)
+                return false;           
+            for (ConstMemberIterator lhsMemberItr = MemberBegin(); lhsMemberItr != MemberEnd(); ++lhsMemberItr) {
+                typename RhsType::ConstMemberIterator rhsMemberItr = rhs.FindMember(lhsMemberItr->name);
+                if (rhsMemberItr == rhs.MemberEnd() || (!(lhsMemberItr->value == rhsMemberItr->value)))
+                    return false;
+            }
+            return true;
+            
+        case kArrayType:
+            if (data_.a.size != rhs.data_.a.size)
+                return false;
+            for (SizeType i = 0; i < data_.a.size; i++)
+                if (!((*this)[i] == rhs[i]))
+                    return false;
+            return true;
+
+        case kStringType:
+            return StringEqual(rhs);
+
+        case kNumberType:
+            if (IsDouble() || rhs.IsDouble()) {
+                double a = GetDouble();     // May convert from integer to double.
+                double b = rhs.GetDouble(); // Ditto
+                return a >= b && a <= b;    // Prevent -Wfloat-equal
+            }
+            else
+                return data_.n.u64 == rhs.data_.n.u64;
+
+        default:
+            return true;
+        }
+    }
+
+    //! Equal-to operator with const C-string pointer
+    bool operator==(const Ch* rhs) const { return *this == GenericValue(StringRef(rhs)); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Equal-to operator with string object
+    /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+     */
+    bool operator==(const std::basic_string<Ch>& rhs) const { return *this == GenericValue(StringRef(rhs)); }
+#endif
+
+    //! Equal-to operator with primitive types
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c true, \c false
+    */
+    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>,internal::IsGenericValue<T> >), (bool)) operator==(const T& rhs) const { return *this == GenericValue(rhs); }
+
+#ifndef __cpp_impl_three_way_comparison
+    //! Not-equal-to operator
+    /*! \return !(*this == rhs)
+     */
+    template <typename SourceAllocator>
+    bool operator!=(const GenericValue<Encoding, SourceAllocator>& rhs) const { return !(*this == rhs); }
+
+    //! Not-equal-to operator with const C-string pointer
+    bool operator!=(const Ch* rhs) const { return !(*this == rhs); }
+
+    //! Not-equal-to operator with arbitrary types
+    /*! \return !(*this == rhs)
+     */
+    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator!=(const T& rhs) const { return !(*this == rhs); }
+
+    //! Equal-to operator with arbitrary types (symmetric version)
+    /*! \return (rhs == lhs)
+     */
+    template <typename T> friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator==(const T& lhs, const GenericValue& rhs) { return rhs == lhs; }
+
+    //! Not-Equal-to operator with arbitrary types (symmetric version)
+    /*! \return !(rhs == lhs)
+     */
+    template <typename T> friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator!=(const T& lhs, const GenericValue& rhs) { return !(rhs == lhs); }
+    //@}
+#endif
+
+    //!@name Type
+    //@{
+
+    Type GetType()  const { return static_cast<Type>(data_.f.flags & kTypeMask); }
+    bool IsNull()   const { return data_.f.flags == kNullFlag; }
+    bool IsFalse()  const { return data_.f.flags == kFalseFlag; }
+    bool IsTrue()   const { return data_.f.flags == kTrueFlag; }
+    bool IsBool()   const { return (data_.f.flags & kBoolFlag) != 0; }
+    bool IsObject() const { return data_.f.flags == kObjectFlag; }
+    bool IsArray()  const { return data_.f.flags == kArrayFlag; }
+    bool IsNumber() const { return (data_.f.flags & kNumberFlag) != 0; }
+    bool IsInt()    const { return (data_.f.flags & kIntFlag) != 0; }
+    bool IsUint()   const { return (data_.f.flags & kUintFlag) != 0; }
+    bool IsInt64()  const { return (data_.f.flags & kInt64Flag) != 0; }
+    bool IsUint64() const { return (data_.f.flags & kUint64Flag) != 0; }
+    bool IsDouble() const { return (data_.f.flags & kDoubleFlag) != 0; }
+    bool IsString() const { return (data_.f.flags & kStringFlag) != 0; }
+
+    // Checks whether a number can be losslessly converted to a double.
+    bool IsLosslessDouble() const {
+        if (!IsNumber()) return false;
+        if (IsUint64()) {
+            uint64_t u = GetUint64();
+            volatile double d = static_cast<double>(u);
+            return (d >= 0.0)
+                && (d < static_cast<double>((std::numeric_limits<uint64_t>::max)()))
+                && (u == static_cast<uint64_t>(d));
+        }
+        if (IsInt64()) {
+            int64_t i = GetInt64();
+            volatile double d = static_cast<double>(i);
+            return (d >= static_cast<double>((std::numeric_limits<int64_t>::min)()))
+                && (d < static_cast<double>((std::numeric_limits<int64_t>::max)()))
+                && (i == static_cast<int64_t>(d));
+        }
+        return true; // double, int, uint are always lossless
+    }
+
+    // Checks whether a number is a float (possible lossy).
+    bool IsFloat() const  {
+        if ((data_.f.flags & kDoubleFlag) == 0)
+            return false;
+        double d = GetDouble();
+        return d >= -3.4028234e38 && d <= 3.4028234e38;
+    }
+    // Checks whether a number can be losslessly converted to a float.
+    bool IsLosslessFloat() const {
+        if (!IsNumber()) return false;
+        double a = GetDouble();
+        if (a < static_cast<double>(-(std::numeric_limits<float>::max)())
+                || a > static_cast<double>((std::numeric_limits<float>::max)()))
+            return false;
+        double b = static_cast<double>(static_cast<float>(a));
+        return a >= b && a <= b;    // Prevent -Wfloat-equal
+    }
+
+    //@}
+
+    //!@name Null
+    //@{
+
+    GenericValue& SetNull() { this->~GenericValue(); new (this) GenericValue(); return *this; }
+
+    //@}
+
+    //!@name Bool
+    //@{
+
+    bool GetBool() const { RAPIDJSON_ASSERT(IsBool()); return data_.f.flags == kTrueFlag; }
+    //!< Set boolean value
+    /*! \post IsBool() == true */
+    GenericValue& SetBool(bool b) { this->~GenericValue(); new (this) GenericValue(b); return *this; }
+
+    //@}
+
+    //!@name Object
+    //@{
+
+    //! Set this value as an empty object.
+    /*! \post IsObject() == true */
+    GenericValue& SetObject() { this->~GenericValue(); new (this) GenericValue(kObjectType); return *this; }
+
+    //! Get the number of members in the object.
+    SizeType MemberCount() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size; }
+
+    //! Get the capacity of object.
+    SizeType MemberCapacity() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.capacity; }
+
+    //! Check whether the object is empty.
+    bool ObjectEmpty() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size == 0; }
+
+    //! Get a value from an object associated with the name.
+    /*! \pre IsObject() == true
+        \tparam T Either \c Ch or \c const \c Ch (template used for disambiguation with \ref operator[](SizeType))
+        \note In version 0.1x, if the member is not found, this function returns a null value. This makes issue 7.
+        Since 0.2, if the name is not correct, it will assert.
+        If user is unsure whether a member exists, user should use HasMember() first.
+        A better approach is to use FindMember().
+        \note Linear time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(GenericValue&)) operator[](T* name) {
+        GenericValue n(StringRef(name));
+        return (*this)[n];
+    }
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(const GenericValue&)) operator[](T* name) const { return const_cast<GenericValue&>(*this)[name]; }
+
+    //! Get a value from an object associated with the name.
+    /*! \pre IsObject() == true
+        \tparam SourceAllocator Allocator of the \c name value
+
+        \note Compared to \ref operator[](T*), this version is faster because it does not need a StrLen().
+        And it can also handle strings with embedded null characters.
+
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator member = FindMember(name);
+        if (member != MemberEnd())
+            return member->value;
+        else {
+            RAPIDJSON_ASSERT(false);    // see above note
+
+#if RAPIDJSON_HAS_CXX11
+            // Use thread-local storage to prevent races between threads.
+            // Use static buffer and placement-new to prevent destruction, with
+            // alignas() to ensure proper alignment.
+            alignas(GenericValue) thread_local static char buffer[sizeof(GenericValue)];
+            return *new (buffer) GenericValue();
+#elif defined(_MSC_VER) && _MSC_VER < 1900
+            // There's no way to solve both thread locality and proper alignment
+            // simultaneously.
+            __declspec(thread) static char buffer[sizeof(GenericValue)];
+            return *new (buffer) GenericValue();
+#elif defined(__GNUC__) || defined(__clang__)
+            // This will generate -Wexit-time-destructors in clang, but that's
+            // better than having under-alignment.
+            __thread static GenericValue buffer;
+            return buffer;
+#else
+            // Don't know what compiler this is, so don't know how to ensure
+            // thread-locality.
+            static GenericValue buffer;
+            return buffer;
+#endif
+        }
+    }
+    template <typename SourceAllocator>
+    const GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this)[name]; }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Get a value from an object associated with name (string object).
+    GenericValue& operator[](const std::basic_string<Ch>& name) { return (*this)[GenericValue(StringRef(name))]; }
+    const GenericValue& operator[](const std::basic_string<Ch>& name) const { return (*this)[GenericValue(StringRef(name))]; }
+#endif
+
+    //! Const member iterator
+    /*! \pre IsObject() == true */
+    ConstMemberIterator MemberBegin() const { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer()); }
+    //! Const \em past-the-end member iterator
+    /*! \pre IsObject() == true */
+    ConstMemberIterator MemberEnd() const   { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer() + data_.o.size); }
+    //! Member iterator
+    /*! \pre IsObject() == true */
+    MemberIterator MemberBegin()            { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer()); }
+    //! \em Past-the-end member iterator
+    /*! \pre IsObject() == true */
+    MemberIterator MemberEnd()              { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer() + data_.o.size); }
+
+    //! Request the object to have enough capacity to store members.
+    /*! \param newCapacity  The capacity that the object at least need to have.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note Linear time complexity.
+    */
+    GenericValue& MemberReserve(SizeType newCapacity, Allocator &allocator) {
+        RAPIDJSON_ASSERT(IsObject());
+        DoReserveMembers(newCapacity, allocator);
+        return *this;
+    }
+
+    //! Check whether a member exists in the object.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need the obtain the value as well.
+        \note Linear time complexity.
+    */
+    bool HasMember(const Ch* name) const { return FindMember(name) != MemberEnd(); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Check whether a member exists in the object with string object.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need the obtain the value as well.
+        \note Linear time complexity.
+    */
+    bool HasMember(const std::basic_string<Ch>& name) const { return FindMember(name) != MemberEnd(); }
+#endif
+
+    //! Check whether a member exists in the object with GenericValue name.
+    /*!
+        This version is faster because it does not need a StrLen(). It can also handle string with null character.
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need the obtain the value as well.
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    bool HasMember(const GenericValue<Encoding, SourceAllocator>& name) const { return FindMember(name) != MemberEnd(); }
+
+    //! Find member by name.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
+
+        \note Earlier versions of Rapidjson returned a \c NULL pointer, in case
+            the requested member doesn't exist. For consistency with e.g.
+            \c std::map, this has been changed to MemberEnd() now.
+        \note Linear time complexity.
+    */
+    MemberIterator FindMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return FindMember(n);
+    }
+
+    ConstMemberIterator FindMember(const Ch* name) const { return const_cast<GenericValue&>(*this).FindMember(name); }
+
+    //! Find member by name.
+    /*!
+        This version is faster because it does not need a StrLen(). It can also handle string with null character.
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
+
+        \note Earlier versions of Rapidjson returned a \c NULL pointer, in case
+            the requested member doesn't exist. For consistency with e.g.
+            \c std::map, this has been changed to MemberEnd() now.
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    MemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(name.IsString());
+        return DoFindMember(name);
+    }
+    template <typename SourceAllocator> ConstMemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this).FindMember(name); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Find member by string object name.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
+    */
+    MemberIterator FindMember(const std::basic_string<Ch>& name) { return FindMember(GenericValue(StringRef(name))); }
+    ConstMemberIterator FindMember(const std::basic_string<Ch>& name) const { return FindMember(GenericValue(StringRef(name))); }
+#endif
+
+    //! Add a member (name-value pair) to the object.
+    /*! \param name A string value as name of member.
+        \param value Value of any type.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note The ownership of \c name and \c value will be transferred to this object on success.
+        \pre  IsObject() && name.IsString()
+        \post name.IsNull() && value.IsNull()
+        \note Amortized Constant time complexity.
+    */
+    GenericValue& AddMember(GenericValue& name, GenericValue& value, Allocator& allocator) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(name.IsString());
+        DoAddMember(name, value, allocator);
+        return *this;
+    }
+
+    //! Add a constant string value as member (name-value pair) to the object.
+    /*! \param name A string value as name of member.
+        \param value constant string reference as value of member.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \pre  IsObject()
+        \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below.
+        \note Amortized Constant time complexity.
+    */
+    GenericValue& AddMember(GenericValue& name, StringRefType value, Allocator& allocator) {
+        GenericValue v(value);
+        return AddMember(name, v, allocator);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Add a string object as member (name-value pair) to the object.
+    /*! \param name A string value as name of member.
+        \param value constant string reference as value of member.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \pre  IsObject()
+        \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below.
+        \note Amortized Constant time complexity.
+    */
+    GenericValue& AddMember(GenericValue& name, std::basic_string<Ch>& value, Allocator& allocator) {
+        GenericValue v(value, allocator);
+        return AddMember(name, v, allocator);
+    }
+#endif
+
+    //! Add any primitive value as member (name-value pair) to the object.
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t
+        \param name A string value as name of member.
+        \param value Value of primitive type \c T as value of member
+        \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \pre  IsObject()
+
+        \note The source type \c T explicitly disallows all pointer types,
+            especially (\c const) \ref Ch*.  This helps avoiding implicitly
+            referencing character strings with insufficient lifetime, use
+            \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref
+            AddMember(StringRefType, StringRefType, Allocator&).
+            All other pointer types would implicitly convert to \c bool,
+            use an explicit cast instead, if needed.
+        \note Amortized Constant time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&))
+    AddMember(GenericValue& name, T value, Allocator& allocator) {
+        GenericValue v(value);
+        return AddMember(name, v, allocator);
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericValue& AddMember(GenericValue&& name, GenericValue&& value, Allocator& allocator) {
+        return AddMember(name, value, allocator);
+    }
+    GenericValue& AddMember(GenericValue&& name, GenericValue& value, Allocator& allocator) {
+        return AddMember(name, value, allocator);
+    }
+    GenericValue& AddMember(GenericValue& name, GenericValue&& value, Allocator& allocator) {
+        return AddMember(name, value, allocator);
+    }
+    GenericValue& AddMember(StringRefType name, GenericValue&& value, Allocator& allocator) {
+        GenericValue n(name);
+        return AddMember(n, value, allocator);
+    }
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+
+
+    //! Add a member (name-value pair) to the object.
+    /*! \param name A constant string reference as name of member.
+        \param value Value of any type.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note The ownership of \c value will be transferred to this object on success.
+        \pre  IsObject()
+        \post value.IsNull()
+        \note Amortized Constant time complexity.
+    */
+    GenericValue& AddMember(StringRefType name, GenericValue& value, Allocator& allocator) {
+        GenericValue n(name);
+        return AddMember(n, value, allocator);
+    }
+
+    //! Add a constant string value as member (name-value pair) to the object.
+    /*! \param name A constant string reference as name of member.
+        \param value constant string reference as value of member.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \pre  IsObject()
+        \note This overload is needed to avoid clashes with the generic primitive type AddMember(StringRefType,T,Allocator&) overload below.
+        \note Amortized Constant time complexity.
+    */
+    GenericValue& AddMember(StringRefType name, StringRefType value, Allocator& allocator) {
+        GenericValue v(value);
+        return AddMember(name, v, allocator);
+    }
+
+    //! Add any primitive value as member (name-value pair) to the object.
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t
+        \param name A constant string reference as name of member.
+        \param value Value of primitive type \c T as value of member
+        \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \pre  IsObject()
+
+        \note The source type \c T explicitly disallows all pointer types,
+            especially (\c const) \ref Ch*.  This helps avoiding implicitly
+            referencing character strings with insufficient lifetime, use
+            \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref
+            AddMember(StringRefType, StringRefType, Allocator&).
+            All other pointer types would implicitly convert to \c bool,
+            use an explicit cast instead, if needed.
+        \note Amortized Constant time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&))
+    AddMember(StringRefType name, T value, Allocator& allocator) {
+        GenericValue n(name);
+        return AddMember(n, value, allocator);
+    }
+
+    //! Remove all members in the object.
+    /*! This function do not deallocate memory in the object, i.e. the capacity is unchanged.
+        \note Linear time complexity.
+    */
+    void RemoveAllMembers() {
+        RAPIDJSON_ASSERT(IsObject()); 
+        DoClearMembers();
+    }
+
+    //! Remove a member in object by its name.
+    /*! \param name Name of member to be removed.
+        \return Whether the member existed.
+        \note This function may reorder the object members. Use \ref
+            EraseMember(ConstMemberIterator) if you need to preserve the
+            relative order of the remaining members.
+        \note Linear time complexity.
+    */
+    bool RemoveMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return RemoveMember(n);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool RemoveMember(const std::basic_string<Ch>& name) { return RemoveMember(GenericValue(StringRef(name))); }
+#endif
+
+    template <typename SourceAllocator>
+    bool RemoveMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator m = FindMember(name);
+        if (m != MemberEnd()) {
+            RemoveMember(m);
+            return true;
+        }
+        else
+            return false;
+    }
+
+    //! Remove a member in object by iterator.
+    /*! \param m member iterator (obtained by FindMember() or MemberBegin()).
+        \return the new iterator after removal.
+        \note This function may reorder the object members. Use \ref
+            EraseMember(ConstMemberIterator) if you need to preserve the
+            relative order of the remaining members.
+        \note Constant time complexity.
+    */
+    MemberIterator RemoveMember(MemberIterator m) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(data_.o.size > 0);
+        RAPIDJSON_ASSERT(GetMembersPointer() != 0);
+        RAPIDJSON_ASSERT(m >= MemberBegin() && m < MemberEnd());
+        return DoRemoveMember(m);
+    }
+
+    //! Remove a member from an object by iterator.
+    /*! \param pos iterator to the member to remove
+        \pre IsObject() == true && \ref MemberBegin() <= \c pos < \ref MemberEnd()
+        \return Iterator following the removed element.
+            If the iterator \c pos refers to the last element, the \ref MemberEnd() iterator is returned.
+        \note This function preserves the relative order of the remaining object
+            members. If you do not need this, use the more efficient \ref RemoveMember(MemberIterator).
+        \note Linear time complexity.
+    */
+    MemberIterator EraseMember(ConstMemberIterator pos) {
+        return EraseMember(pos, pos +1);
+    }
+
+    //! Remove members in the range [first, last) from an object.
+    /*! \param first iterator to the first member to remove
+        \param last  iterator following the last member to remove
+        \pre IsObject() == true && \ref MemberBegin() <= \c first <= \c last <= \ref MemberEnd()
+        \return Iterator following the last removed element.
+        \note This function preserves the relative order of the remaining object
+            members.
+        \note Linear time complexity.
+    */
+    MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(data_.o.size > 0);
+        RAPIDJSON_ASSERT(GetMembersPointer() != 0);
+        RAPIDJSON_ASSERT(first >= MemberBegin());
+        RAPIDJSON_ASSERT(first <= last);
+        RAPIDJSON_ASSERT(last <= MemberEnd());
+        return DoEraseMembers(first, last);
+    }
+
+    //! Erase a member in object by its name.
+    /*! \param name Name of member to be removed.
+        \return Whether the member existed.
+        \note Linear time complexity.
+    */
+    bool EraseMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return EraseMember(n);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool EraseMember(const std::basic_string<Ch>& name) { return EraseMember(GenericValue(StringRef(name))); }
+#endif
+
+    template <typename SourceAllocator>
+    bool EraseMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator m = FindMember(name);
+        if (m != MemberEnd()) {
+            EraseMember(m);
+            return true;
+        }
+        else
+            return false;
+    }
+
+    Object GetObject() { RAPIDJSON_ASSERT(IsObject()); return Object(*this); }
+    Object GetObj() { RAPIDJSON_ASSERT(IsObject()); return Object(*this); }
+    ConstObject GetObject() const { RAPIDJSON_ASSERT(IsObject()); return ConstObject(*this); }
+    ConstObject GetObj() const { RAPIDJSON_ASSERT(IsObject()); return ConstObject(*this); }
+
+    //@}
+
+    //!@name Array
+    //@{
+
+    //! Set this value as an empty array.
+    /*! \post IsArray == true */
+    GenericValue& SetArray() { this->~GenericValue(); new (this) GenericValue(kArrayType); return *this; }
+
+    //! Get the number of elements in array.
+    SizeType Size() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size; }
+
+    //! Get the capacity of array.
+    SizeType Capacity() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.capacity; }
+
+    //! Check whether the array is empty.
+    bool Empty() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size == 0; }
+
+    //! Remove all elements in the array.
+    /*! This function do not deallocate memory in the array, i.e. the capacity is unchanged.
+        \note Linear time complexity.
+    */
+    void Clear() {
+        RAPIDJSON_ASSERT(IsArray()); 
+        GenericValue* e = GetElementsPointer();
+        for (GenericValue* v = e; v != e + data_.a.size; ++v)
+            v->~GenericValue();
+        data_.a.size = 0;
+    }
+
+    //! Get an element from array by index.
+    /*! \pre IsArray() == true
+        \param index Zero-based index of element.
+        \see operator[](T*)
+    */
+    GenericValue& operator[](SizeType index) {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(index < data_.a.size);
+        return GetElementsPointer()[index];
+    }
+    const GenericValue& operator[](SizeType index) const { return const_cast<GenericValue&>(*this)[index]; }
+
+    //! Element iterator
+    /*! \pre IsArray() == true */
+    ValueIterator Begin() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer(); }
+    //! \em Past-the-end element iterator
+    /*! \pre IsArray() == true */
+    ValueIterator End() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer() + data_.a.size; }
+    //! Constant element iterator
+    /*! \pre IsArray() == true */
+    ConstValueIterator Begin() const { return const_cast<GenericValue&>(*this).Begin(); }
+    //! Constant \em past-the-end element iterator
+    /*! \pre IsArray() == true */
+    ConstValueIterator End() const { return const_cast<GenericValue&>(*this).End(); }
+
+    //! Request the array to have enough capacity to store elements.
+    /*! \param newCapacity  The capacity that the array at least need to have.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note Linear time complexity.
+    */
+    GenericValue& Reserve(SizeType newCapacity, Allocator &allocator) {
+        RAPIDJSON_ASSERT(IsArray());
+        if (newCapacity > data_.a.capacity) {
+            SetElementsPointer(reinterpret_cast<GenericValue*>(allocator.Realloc(GetElementsPointer(), data_.a.capacity * sizeof(GenericValue), newCapacity * sizeof(GenericValue))));
+            data_.a.capacity = newCapacity;
+        }
+        return *this;
+    }
+
+    //! Append a GenericValue at the end of the array.
+    /*! \param value        Value to be appended.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \post value.IsNull() == true
+        \return The value itself for fluent API.
+        \note The ownership of \c value will be transferred to this array on success.
+        \note If the number of elements to be appended is known, calls Reserve() once first may be more efficient.
+        \note Amortized constant time complexity.
+    */
+    GenericValue& PushBack(GenericValue& value, Allocator& allocator) {
+        RAPIDJSON_ASSERT(IsArray());
+        if (data_.a.size >= data_.a.capacity)
+            Reserve(data_.a.capacity == 0 ? kDefaultArrayCapacity : (data_.a.capacity + (data_.a.capacity + 1) / 2), allocator);
+        GetElementsPointer()[data_.a.size++].RawAssign(value);
+        return *this;
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericValue& PushBack(GenericValue&& value, Allocator& allocator) {
+        return PushBack(value, allocator);
+    }
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+
+    //! Append a constant string reference at the end of the array.
+    /*! \param value        Constant string reference to be appended.
+        \param allocator    Allocator for reallocating memory. It must be the same one used previously. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \return The value itself for fluent API.
+        \note If the number of elements to be appended is known, calls Reserve() once first may be more efficient.
+        \note Amortized constant time complexity.
+        \see GenericStringRef
+    */
+    GenericValue& PushBack(StringRefType value, Allocator& allocator) {
+        return (*this).template PushBack<StringRefType>(value, allocator);
+    }
+
+    //! Append a primitive value at the end of the array.
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t
+        \param value Value of primitive type T to be appended.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \return The value itself for fluent API.
+        \note If the number of elements to be appended is known, calls Reserve() once first may be more efficient.
+
+        \note The source type \c T explicitly disallows all pointer types,
+            especially (\c const) \ref Ch*.  This helps avoiding implicitly
+            referencing character strings with insufficient lifetime, use
+            \ref PushBack(GenericValue&, Allocator&) or \ref
+            PushBack(StringRefType, Allocator&).
+            All other pointer types would implicitly convert to \c bool,
+            use an explicit cast instead, if needed.
+        \note Amortized constant time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&))
+    PushBack(T value, Allocator& allocator) {
+        GenericValue v(value);
+        return PushBack(v, allocator);
+    }
+
+    //! Remove the last element in the array.
+    /*!
+        \note Constant time complexity.
+    */
+    GenericValue& PopBack() {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(!Empty());
+        GetElementsPointer()[--data_.a.size].~GenericValue();
+        return *this;
+    }
+
+    //! Remove an element of array by iterator.
+    /*!
+        \param pos iterator to the element to remove
+        \pre IsArray() == true && \ref Begin() <= \c pos < \ref End()
+        \return Iterator following the removed element. If the iterator pos refers to the last element, the End() iterator is returned.
+        \note Linear time complexity.
+    */
+    ValueIterator Erase(ConstValueIterator pos) {
+        return Erase(pos, pos + 1);
+    }
+
+    //! Remove elements in the range [first, last) of the array.
+    /*!
+        \param first iterator to the first element to remove
+        \param last  iterator following the last element to remove
+        \pre IsArray() == true && \ref Begin() <= \c first <= \c last <= \ref End()
+        \return Iterator following the last removed element.
+        \note Linear time complexity.
+    */
+    ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(data_.a.size > 0);
+        RAPIDJSON_ASSERT(GetElementsPointer() != 0);
+        RAPIDJSON_ASSERT(first >= Begin());
+        RAPIDJSON_ASSERT(first <= last);
+        RAPIDJSON_ASSERT(last <= End());
+        ValueIterator pos = Begin() + (first - Begin());
+        for (ValueIterator itr = pos; itr != last; ++itr)
+            itr->~GenericValue();
+        std::memmove(static_cast<void*>(pos), last, static_cast<size_t>(End() - last) * sizeof(GenericValue));
+        data_.a.size -= static_cast<SizeType>(last - first);
+        return pos;
+    }
+
+    Array GetArray() { RAPIDJSON_ASSERT(IsArray()); return Array(*this); }
+    ConstArray GetArray() const { RAPIDJSON_ASSERT(IsArray()); return ConstArray(*this); }
+
+    //@}
+
+    //!@name Number
+    //@{
+
+    int GetInt() const          { RAPIDJSON_ASSERT(data_.f.flags & kIntFlag);   return data_.n.i.i;   }
+    unsigned GetUint() const    { RAPIDJSON_ASSERT(data_.f.flags & kUintFlag);  return data_.n.u.u;   }
+    int64_t GetInt64() const    { RAPIDJSON_ASSERT(data_.f.flags & kInt64Flag); return data_.n.i64; }
+    uint64_t GetUint64() const  { RAPIDJSON_ASSERT(data_.f.flags & kUint64Flag); return data_.n.u64; }
+
+    //! Get the value as double type.
+    /*! \note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessDouble() to check whether the converison is lossless.
+    */
+    double GetDouble() const {
+        RAPIDJSON_ASSERT(IsNumber());
+        if ((data_.f.flags & kDoubleFlag) != 0)                return data_.n.d;   // exact type, no conversion.
+        if ((data_.f.flags & kIntFlag) != 0)                   return data_.n.i.i; // int -> double
+        if ((data_.f.flags & kUintFlag) != 0)                  return data_.n.u.u; // unsigned -> double
+        if ((data_.f.flags & kInt64Flag) != 0)                 return static_cast<double>(data_.n.i64); // int64_t -> double (may lose precision)
+        RAPIDJSON_ASSERT((data_.f.flags & kUint64Flag) != 0);  return static_cast<double>(data_.n.u64); // uint64_t -> double (may lose precision)
+    }
+
+    //! Get the value as float type.
+    /*! \note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessFloat() to check whether the converison is lossless.
+    */
+    float GetFloat() const {
+        return static_cast<float>(GetDouble());
+    }
+
+    GenericValue& SetInt(int i)             { this->~GenericValue(); new (this) GenericValue(i);    return *this; }
+    GenericValue& SetUint(unsigned u)       { this->~GenericValue(); new (this) GenericValue(u);    return *this; }
+    GenericValue& SetInt64(int64_t i64)     { this->~GenericValue(); new (this) GenericValue(i64);  return *this; }
+    GenericValue& SetUint64(uint64_t u64)   { this->~GenericValue(); new (this) GenericValue(u64);  return *this; }
+    GenericValue& SetDouble(double d)       { this->~GenericValue(); new (this) GenericValue(d);    return *this; }
+    GenericValue& SetFloat(float f)         { this->~GenericValue(); new (this) GenericValue(static_cast<double>(f)); return *this; }
+
+    //@}
+
+    //!@name String
+    //@{
+
+    const Ch* GetString() const { RAPIDJSON_ASSERT(IsString()); return DataString(data_); }
+
+    //! Get the length of string.
+    /*! Since rapidjson permits "\\u0000" in the json string, strlen(v.GetString()) may not equal to v.GetStringLength().
+    */
+    SizeType GetStringLength() const { RAPIDJSON_ASSERT(IsString()); return DataStringLength(data_); }
+
+    //! Set this value as a string without copying source string.
+    /*! This version has better performance with supplied length, and also support string containing null character.
+        \param s source string pointer. 
+        \param length The length of source string, excluding the trailing null terminator.
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() == s && GetStringLength() == length
+        \see SetString(StringRefType)
+    */
+    GenericValue& SetString(const Ch* s, SizeType length) { return SetString(StringRef(s, length)); }
+
+    //! Set this value as a string without copying source string.
+    /*! \param s source string reference
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() == s && GetStringLength() == s.length
+    */
+    GenericValue& SetString(StringRefType s) { this->~GenericValue(); SetStringRaw(s); return *this; }
+
+    //! Set this value as a string by copying from source string.
+    /*! This version has better performance with supplied length, and also support string containing null character.
+        \param s source string. 
+        \param length The length of source string, excluding the trailing null terminator.
+        \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length
+    */
+    GenericValue& SetString(const Ch* s, SizeType length, Allocator& allocator) { return SetString(StringRef(s, length), allocator); }
+
+    //! Set this value as a string by copying from source string.
+    /*! \param s source string. 
+        \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length
+    */
+    GenericValue& SetString(const Ch* s, Allocator& allocator) { return SetString(StringRef(s), allocator); }
+
+    //! Set this value as a string by copying from source string.
+    /*! \param s source string reference
+        \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() != s.s && strcmp(GetString(),s) == 0 && GetStringLength() == length
+    */
+    GenericValue& SetString(StringRefType s, Allocator& allocator) { this->~GenericValue(); SetStringRaw(s, allocator); return *this; }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Set this value as a string by copying from source string.
+    /*! \param s source string.
+        \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \post IsString() == true && GetString() != s.data() && strcmp(GetString(),s.data() == 0 && GetStringLength() == s.size()
+        \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+    */
+    GenericValue& SetString(const std::basic_string<Ch>& s, Allocator& allocator) { return SetString(StringRef(s), allocator); }
+#endif
+
+    //@}
+
+    //!@name Array
+    //@{
+
+    //! Templated version for checking whether this value is type T.
+    /*!
+        \tparam T Either \c bool, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c float, \c const \c char*, \c std::basic_string<Ch>
+    */
+    template <typename T>
+    bool Is() const { return internal::TypeHelper<ValueType, T>::Is(*this); }
+
+    template <typename T>
+    T Get() const { return internal::TypeHelper<ValueType, T>::Get(*this); }
+
+    template <typename T>
+    T Get() { return internal::TypeHelper<ValueType, T>::Get(*this); }
+
+    template<typename T>
+    ValueType& Set(const T& data) { return internal::TypeHelper<ValueType, T>::Set(*this, data); }
+
+    template<typename T>
+    ValueType& Set(const T& data, AllocatorType& allocator) { return internal::TypeHelper<ValueType, T>::Set(*this, data, allocator); }
+
+    //@}
+
+    //! Generate events of this value to a Handler.
+    /*! This function adopts the GoF visitor pattern.
+        Typical usage is to output this JSON value as JSON text via Writer, which is a Handler.
+        It can also be used to deep clone this value via GenericDocument, which is also a Handler.
+        \tparam Handler type of handler.
+        \param handler An object implementing concept Handler.
+    */
+    template <typename Handler>
+    bool Accept(Handler& handler) const {
+        switch(GetType()) {
+        case kNullType:     return handler.Null();
+        case kFalseType:    return handler.Bool(false);
+        case kTrueType:     return handler.Bool(true);
+
+        case kObjectType:
+            if (RAPIDJSON_UNLIKELY(!handler.StartObject()))
+                return false;
+            for (ConstMemberIterator m = MemberBegin(); m != MemberEnd(); ++m) {
+                RAPIDJSON_ASSERT(m->name.IsString()); // User may change the type of name by MemberIterator.
+                if (RAPIDJSON_UNLIKELY(!handler.Key(m->name.GetString(), m->name.GetStringLength(), (m->name.data_.f.flags & kCopyFlag) != 0)))
+                    return false;
+                if (RAPIDJSON_UNLIKELY(!m->value.Accept(handler)))
+                    return false;
+            }
+            return handler.EndObject(data_.o.size);
+
+        case kArrayType:
+            if (RAPIDJSON_UNLIKELY(!handler.StartArray()))
+                return false;
+            for (ConstValueIterator v = Begin(); v != End(); ++v)
+                if (RAPIDJSON_UNLIKELY(!v->Accept(handler)))
+                    return false;
+            return handler.EndArray(data_.a.size);
+    
+        case kStringType:
+            return handler.String(GetString(), GetStringLength(), (data_.f.flags & kCopyFlag) != 0);
+    
+        default:
+            RAPIDJSON_ASSERT(GetType() == kNumberType);
+            if (IsDouble())         return handler.Double(data_.n.d);
+            else if (IsInt())       return handler.Int(data_.n.i.i);
+            else if (IsUint())      return handler.Uint(data_.n.u.u);
+            else if (IsInt64())     return handler.Int64(data_.n.i64);
+            else                    return handler.Uint64(data_.n.u64);
+        }
+    }
+
+private:
+    template <typename, typename> friend class GenericValue;
+    template <typename, typename, typename> friend class GenericDocument;
+
+    enum {
+        kBoolFlag       = 0x0008,
+        kNumberFlag     = 0x0010,
+        kIntFlag        = 0x0020,
+        kUintFlag       = 0x0040,
+        kInt64Flag      = 0x0080,
+        kUint64Flag     = 0x0100,
+        kDoubleFlag     = 0x0200,
+        kStringFlag     = 0x0400,
+        kCopyFlag       = 0x0800,
+        kInlineStrFlag  = 0x1000,
+
+        // Initial flags of different types.
+        kNullFlag = kNullType,
+        // These casts are added to suppress the warning on MSVC about bitwise operations between enums of different types.
+        kTrueFlag = static_cast<int>(kTrueType) | static_cast<int>(kBoolFlag),
+        kFalseFlag = static_cast<int>(kFalseType) | static_cast<int>(kBoolFlag),
+        kNumberIntFlag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kIntFlag | kInt64Flag),
+        kNumberUintFlag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kUintFlag | kUint64Flag | kInt64Flag),
+        kNumberInt64Flag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kInt64Flag),
+        kNumberUint64Flag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kUint64Flag),
+        kNumberDoubleFlag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kDoubleFlag),
+        kNumberAnyFlag = static_cast<int>(kNumberType) | static_cast<int>(kNumberFlag | kIntFlag | kInt64Flag | kUintFlag | kUint64Flag | kDoubleFlag),
+        kConstStringFlag = static_cast<int>(kStringType) | static_cast<int>(kStringFlag),
+        kCopyStringFlag = static_cast<int>(kStringType) | static_cast<int>(kStringFlag | kCopyFlag),
+        kShortStringFlag = static_cast<int>(kStringType) | static_cast<int>(kStringFlag | kCopyFlag | kInlineStrFlag),
+        kObjectFlag = kObjectType,
+        kArrayFlag = kArrayType,
+
+        kTypeMask = 0x07
+    };
+
+    static const SizeType kDefaultArrayCapacity = RAPIDJSON_VALUE_DEFAULT_ARRAY_CAPACITY;
+    static const SizeType kDefaultObjectCapacity = RAPIDJSON_VALUE_DEFAULT_OBJECT_CAPACITY;
+
+    struct Flag {
+#if RAPIDJSON_48BITPOINTER_OPTIMIZATION
+        char payload[sizeof(SizeType) * 2 + 6];     // 2 x SizeType + lower 48-bit pointer
+#elif RAPIDJSON_64BIT
+        char payload[sizeof(SizeType) * 2 + sizeof(void*) + 6]; // 6 padding bytes
+#else
+        char payload[sizeof(SizeType) * 2 + sizeof(void*) + 2]; // 2 padding bytes
+#endif
+        uint16_t flags;
+    };
+
+    struct String {
+        SizeType length;
+        SizeType hashcode;  //!< reserved
+        const Ch* str;
+    };  // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode
+
+    // implementation detail: ShortString can represent zero-terminated strings up to MaxSize chars
+    // (excluding the terminating zero) and store a value to determine the length of the contained
+    // string in the last character str[LenPos] by storing "MaxSize - length" there. If the string
+    // to store has the maximal length of MaxSize then str[LenPos] will be 0 and therefore act as
+    // the string terminator as well. For getting the string length back from that value just use
+    // "MaxSize - str[LenPos]".
+    // This allows to store 13-chars strings in 32-bit mode, 21-chars strings in 64-bit mode,
+    // 13-chars strings for RAPIDJSON_48BITPOINTER_OPTIMIZATION=1 inline (for `UTF8`-encoded strings).
+    struct ShortString {
+        enum { MaxChars = sizeof(static_cast<Flag*>(0)->payload) / sizeof(Ch), MaxSize = MaxChars - 1, LenPos = MaxSize };
+        Ch str[MaxChars];
+
+        inline static bool Usable(SizeType len) { return                       (MaxSize >= len); }
+        inline void     SetLength(SizeType len) { str[LenPos] = static_cast<Ch>(MaxSize -  len); }
+        inline SizeType GetLength() const       { return  static_cast<SizeType>(MaxSize -  str[LenPos]); }
+    };  // at most as many bytes as "String" above => 12 bytes in 32-bit mode, 16 bytes in 64-bit mode
+
+    // By using proper binary layout, retrieval of different integer types do not need conversions.
+    union Number {
+#if RAPIDJSON_ENDIAN == RAPIDJSON_LITTLEENDIAN
+        struct I {
+            int i;
+            char padding[4];
+        }i;
+        struct U {
+            unsigned u;
+            char padding2[4];
+        }u;
+#else
+        struct I {
+            char padding[4];
+            int i;
+        }i;
+        struct U {
+            char padding2[4];
+            unsigned u;
+        }u;
+#endif
+        int64_t i64;
+        uint64_t u64;
+        double d;
+    };  // 8 bytes
+
+    struct ObjectData {
+        SizeType size;
+        SizeType capacity;
+        Member* members;
+    };  // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode
+
+    struct ArrayData {
+        SizeType size;
+        SizeType capacity;
+        GenericValue* elements;
+    };  // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode
+
+    union Data {
+        String s;
+        ShortString ss;
+        Number n;
+        ObjectData o;
+        ArrayData a;
+        Flag f;
+    };  // 16 bytes in 32-bit mode, 24 bytes in 64-bit mode, 16 bytes in 64-bit with RAPIDJSON_48BITPOINTER_OPTIMIZATION
+
+    static RAPIDJSON_FORCEINLINE const Ch* DataString(const Data& data) {
+        return (data.f.flags & kInlineStrFlag) ? data.ss.str : RAPIDJSON_GETPOINTER(Ch, data.s.str);
+    }
+    static RAPIDJSON_FORCEINLINE SizeType DataStringLength(const Data& data) {
+        return (data.f.flags & kInlineStrFlag) ? data.ss.GetLength() : data.s.length;
+    }
+
+    RAPIDJSON_FORCEINLINE const Ch* GetStringPointer() const { return RAPIDJSON_GETPOINTER(Ch, data_.s.str); }
+    RAPIDJSON_FORCEINLINE const Ch* SetStringPointer(const Ch* str) { return RAPIDJSON_SETPOINTER(Ch, data_.s.str, str); }
+    RAPIDJSON_FORCEINLINE GenericValue* GetElementsPointer() const { return RAPIDJSON_GETPOINTER(GenericValue, data_.a.elements); }
+    RAPIDJSON_FORCEINLINE GenericValue* SetElementsPointer(GenericValue* elements) { return RAPIDJSON_SETPOINTER(GenericValue, data_.a.elements, elements); }
+    RAPIDJSON_FORCEINLINE Member* GetMembersPointer() const { return RAPIDJSON_GETPOINTER(Member, data_.o.members); }
+    RAPIDJSON_FORCEINLINE Member* SetMembersPointer(Member* members) { return RAPIDJSON_SETPOINTER(Member, data_.o.members, members); }
+
+#if RAPIDJSON_USE_MEMBERSMAP
+
+    struct MapTraits {
+        struct Less {
+            bool operator()(const Data& s1, const Data& s2) const {
+                SizeType n1 = DataStringLength(s1), n2 = DataStringLength(s2);
+                int cmp = std::memcmp(DataString(s1), DataString(s2), sizeof(Ch) * (n1 < n2 ? n1 : n2));
+                return cmp < 0 || (cmp == 0 && n1 < n2);
+            }
+        };
+        typedef std::pair<const Data, SizeType> Pair;
+        typedef std::multimap<Data, SizeType, Less, StdAllocator<Pair, Allocator> > Map;
+        typedef typename Map::iterator Iterator;
+    };
+    typedef typename MapTraits::Map         Map;
+    typedef typename MapTraits::Less        MapLess;
+    typedef typename MapTraits::Pair        MapPair;
+    typedef typename MapTraits::Iterator    MapIterator;
+
+    //
+    // Layout of the members' map/array, re(al)located according to the needed capacity:
+    //
+    //    {Map*}<>{capacity}<>{Member[capacity]}<>{MapIterator[capacity]}
+    //
+    // (where <> stands for the RAPIDJSON_ALIGN-ment, if needed)
+    //
+
+    static RAPIDJSON_FORCEINLINE size_t GetMapLayoutSize(SizeType capacity) {
+        return RAPIDJSON_ALIGN(sizeof(Map*)) +
+               RAPIDJSON_ALIGN(sizeof(SizeType)) +
+               RAPIDJSON_ALIGN(capacity * sizeof(Member)) +
+               capacity * sizeof(MapIterator);
+    }
+
+    static RAPIDJSON_FORCEINLINE SizeType &GetMapCapacity(Map* &map) {
+        return *reinterpret_cast<SizeType*>(reinterpret_cast<uintptr_t>(&map) +
+                                            RAPIDJSON_ALIGN(sizeof(Map*)));
+    }
+
+    static RAPIDJSON_FORCEINLINE Member* GetMapMembers(Map* &map) {
+        return reinterpret_cast<Member*>(reinterpret_cast<uintptr_t>(&map) +
+                                         RAPIDJSON_ALIGN(sizeof(Map*)) +
+                                         RAPIDJSON_ALIGN(sizeof(SizeType)));
+    }
+
+    static RAPIDJSON_FORCEINLINE MapIterator* GetMapIterators(Map* &map) {
+        return reinterpret_cast<MapIterator*>(reinterpret_cast<uintptr_t>(&map) +
+                                              RAPIDJSON_ALIGN(sizeof(Map*)) +
+                                              RAPIDJSON_ALIGN(sizeof(SizeType)) +
+                                              RAPIDJSON_ALIGN(GetMapCapacity(map) * sizeof(Member)));
+    }
+
+    static RAPIDJSON_FORCEINLINE Map* &GetMap(Member* members) {
+        RAPIDJSON_ASSERT(members != 0);
+        return *reinterpret_cast<Map**>(reinterpret_cast<uintptr_t>(members) -
+                                        RAPIDJSON_ALIGN(sizeof(SizeType)) -
+                                        RAPIDJSON_ALIGN(sizeof(Map*)));
+    }
+
+    // Some compilers' debug mechanisms want all iterators to be destroyed, for their accounting..
+    RAPIDJSON_FORCEINLINE MapIterator DropMapIterator(MapIterator& rhs) {
+#if RAPIDJSON_HAS_CXX11
+        MapIterator ret = std::move(rhs);
+#else
+        MapIterator ret = rhs;
+#endif
+        rhs.~MapIterator();
+        return ret;
+    }
+
+    Map* &DoReallocMap(Map** oldMap, SizeType newCapacity, Allocator& allocator) {
+        Map **newMap = static_cast<Map**>(allocator.Malloc(GetMapLayoutSize(newCapacity)));
+        GetMapCapacity(*newMap) = newCapacity;
+        if (!oldMap) {
+            *newMap = new (allocator.Malloc(sizeof(Map))) Map(MapLess(), allocator);
+        }
+        else {
+            *newMap = *oldMap;
+            size_t count = (*oldMap)->size();
+            std::memcpy(static_cast<void*>(GetMapMembers(*newMap)),
+                        static_cast<void*>(GetMapMembers(*oldMap)),
+                        count * sizeof(Member));
+            MapIterator *oldIt = GetMapIterators(*oldMap),
+                        *newIt = GetMapIterators(*newMap);
+            while (count--) {
+                new (&newIt[count]) MapIterator(DropMapIterator(oldIt[count]));
+            }
+            Allocator::Free(oldMap);
+        }
+        return *newMap;
+    }
+
+    RAPIDJSON_FORCEINLINE Member* DoAllocMembers(SizeType capacity, Allocator& allocator) {
+        return GetMapMembers(DoReallocMap(0, capacity, allocator));
+    }
+
+    void DoReserveMembers(SizeType newCapacity, Allocator& allocator) {
+        ObjectData& o = data_.o;
+        if (newCapacity > o.capacity) {
+            Member* oldMembers = GetMembersPointer();
+            Map **oldMap = oldMembers ? &GetMap(oldMembers) : 0,
+                *&newMap = DoReallocMap(oldMap, newCapacity, allocator);
+            RAPIDJSON_SETPOINTER(Member, o.members, GetMapMembers(newMap));
+            o.capacity = newCapacity;
+        }
+    }
+
+    template <typename SourceAllocator>
+    MemberIterator DoFindMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        if (Member* members = GetMembersPointer()) {
+            Map* &map = GetMap(members);
+            MapIterator mit = map->find(reinterpret_cast<const Data&>(name.data_));
+            if (mit != map->end()) {
+                return MemberIterator(&members[mit->second]);
+            }
+        }
+        return MemberEnd();
+    }
+
+    void DoClearMembers() {
+        if (Member* members = GetMembersPointer()) {
+            Map* &map = GetMap(members);
+            MapIterator* mit = GetMapIterators(map);
+            for (SizeType i = 0; i < data_.o.size; i++) {
+                map->erase(DropMapIterator(mit[i]));
+                members[i].~Member();
+            }
+            data_.o.size = 0;
+        }
+    }
+
+    void DoFreeMembers() {
+        if (Member* members = GetMembersPointer()) {
+            GetMap(members)->~Map();
+            for (SizeType i = 0; i < data_.o.size; i++) {
+                members[i].~Member();
+            }
+            if (Allocator::kNeedFree) { // Shortcut by Allocator's trait
+                Map** map = &GetMap(members);
+                Allocator::Free(*map);
+                Allocator::Free(map);
+            }
+        }
+    }
+
+#else // !RAPIDJSON_USE_MEMBERSMAP
+
+    RAPIDJSON_FORCEINLINE Member* DoAllocMembers(SizeType capacity, Allocator& allocator) {
+        return Malloc<Member>(allocator, capacity);
+    }
+
+    void DoReserveMembers(SizeType newCapacity, Allocator& allocator) {
+        ObjectData& o = data_.o;
+        if (newCapacity > o.capacity) {
+            Member* newMembers = Realloc<Member>(allocator, GetMembersPointer(), o.capacity, newCapacity);
+            RAPIDJSON_SETPOINTER(Member, o.members, newMembers);
+            o.capacity = newCapacity;
+        }
+    }
+
+    template <typename SourceAllocator>
+    MemberIterator DoFindMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator member = MemberBegin();
+        for ( ; member != MemberEnd(); ++member)
+            if (name.StringEqual(member->name))
+                break;
+        return member;
+    }
+
+    void DoClearMembers() {
+        for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m)
+            m->~Member();
+        data_.o.size = 0;
+    }
+
+    void DoFreeMembers() {
+        for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m)
+            m->~Member();
+        Allocator::Free(GetMembersPointer());
+    }
+
+#endif // !RAPIDJSON_USE_MEMBERSMAP
+
+    void DoAddMember(GenericValue& name, GenericValue& value, Allocator& allocator) {
+        ObjectData& o = data_.o;
+        if (o.size >= o.capacity)
+            DoReserveMembers(o.capacity ? (o.capacity + (o.capacity + 1) / 2) : kDefaultObjectCapacity, allocator);
+        Member* members = GetMembersPointer();
+        Member* m = members + o.size;
+        m->name.RawAssign(name);
+        m->value.RawAssign(value);
+#if RAPIDJSON_USE_MEMBERSMAP
+        Map* &map = GetMap(members);
+        MapIterator* mit = GetMapIterators(map);
+        new (&mit[o.size]) MapIterator(map->insert(MapPair(m->name.data_, o.size)));
+#endif
+        ++o.size;
+    }
+
+    MemberIterator DoRemoveMember(MemberIterator m) {
+        ObjectData& o = data_.o;
+        Member* members = GetMembersPointer();
+#if RAPIDJSON_USE_MEMBERSMAP
+        Map* &map = GetMap(members);
+        MapIterator* mit = GetMapIterators(map);
+        SizeType mpos = static_cast<SizeType>(&*m - members);
+        map->erase(DropMapIterator(mit[mpos]));
+#endif
+        MemberIterator last(members + (o.size - 1));
+        if (o.size > 1 && m != last) {
+#if RAPIDJSON_USE_MEMBERSMAP
+            new (&mit[mpos]) MapIterator(DropMapIterator(mit[&*last - members]));
+            mit[mpos]->second = mpos;
+#endif
+            *m = *last; // Move the last one to this place
+        }
+        else {
+            m->~Member(); // Only one left, just destroy
+        }
+        --o.size;
+        return m;
+    }
+
+    MemberIterator DoEraseMembers(ConstMemberIterator first, ConstMemberIterator last) {
+        ObjectData& o = data_.o;
+        MemberIterator beg = MemberBegin(),
+                       pos = beg + (first - beg),
+                       end = MemberEnd();
+#if RAPIDJSON_USE_MEMBERSMAP
+        Map* &map = GetMap(GetMembersPointer());
+        MapIterator* mit = GetMapIterators(map);
+#endif
+        for (MemberIterator itr = pos; itr != last; ++itr) {
+#if RAPIDJSON_USE_MEMBERSMAP
+            map->erase(DropMapIterator(mit[itr - beg]));
+#endif
+            itr->~Member();
+        }
+#if RAPIDJSON_USE_MEMBERSMAP
+        if (first != last) {
+            // Move remaining members/iterators
+            MemberIterator next = pos + (last - first);
+            for (MemberIterator itr = pos; next != end; ++itr, ++next) {
+                std::memcpy(static_cast<void*>(&*itr), &*next, sizeof(Member));
+                SizeType mpos = static_cast<SizeType>(itr - beg);
+                new (&mit[mpos]) MapIterator(DropMapIterator(mit[next - beg]));
+                mit[mpos]->second = mpos;
+            }
+        }
+#else
+        std::memmove(static_cast<void*>(&*pos), &*last,
+                     static_cast<size_t>(end - last) * sizeof(Member));
+#endif
+        o.size -= static_cast<SizeType>(last - first);
+        return pos;
+    }
+
+    template <typename SourceAllocator>
+    void DoCopyMembers(const GenericValue<Encoding,SourceAllocator>& rhs, Allocator& allocator, bool copyConstStrings) {
+        RAPIDJSON_ASSERT(rhs.GetType() == kObjectType);
+
+        data_.f.flags = kObjectFlag;
+        SizeType count = rhs.data_.o.size;
+        Member* lm = DoAllocMembers(count, allocator);
+        const typename GenericValue<Encoding,SourceAllocator>::Member* rm = rhs.GetMembersPointer();
+#if RAPIDJSON_USE_MEMBERSMAP
+        Map* &map = GetMap(lm);
+        MapIterator* mit = GetMapIterators(map);
+#endif
+        for (SizeType i = 0; i < count; i++) {
+            new (&lm[i].name) GenericValue(rm[i].name, allocator, copyConstStrings);
+            new (&lm[i].value) GenericValue(rm[i].value, allocator, copyConstStrings);
+#if RAPIDJSON_USE_MEMBERSMAP
+            new (&mit[i]) MapIterator(map->insert(MapPair(lm[i].name.data_, i)));
+#endif
+        }
+        data_.o.size = data_.o.capacity = count;
+        SetMembersPointer(lm);
+    }
+
+    // Initialize this value as array with initial data, without calling destructor.
+    void SetArrayRaw(GenericValue* values, SizeType count, Allocator& allocator) {
+        data_.f.flags = kArrayFlag;
+        if (count) {
+            GenericValue* e = static_cast<GenericValue*>(allocator.Malloc(count * sizeof(GenericValue)));
+            SetElementsPointer(e);
+            std::memcpy(static_cast<void*>(e), values, count * sizeof(GenericValue));
+        }
+        else
+            SetElementsPointer(0);
+        data_.a.size = data_.a.capacity = count;
+    }
+
+    //! Initialize this value as object with initial data, without calling destructor.
+    void SetObjectRaw(Member* members, SizeType count, Allocator& allocator) {
+        data_.f.flags = kObjectFlag;
+        if (count) {
+            Member* m = DoAllocMembers(count, allocator);
+            SetMembersPointer(m);
+            std::memcpy(static_cast<void*>(m), members, count * sizeof(Member));
+#if RAPIDJSON_USE_MEMBERSMAP
+            Map* &map = GetMap(m);
+            MapIterator* mit = GetMapIterators(map);
+            for (SizeType i = 0; i < count; i++) {
+                new (&mit[i]) MapIterator(map->insert(MapPair(m[i].name.data_, i)));
+            }
+#endif
+        }
+        else
+            SetMembersPointer(0);
+        data_.o.size = data_.o.capacity = count;
+    }
+
+    //! Initialize this value as constant string, without calling destructor.
+    void SetStringRaw(StringRefType s) RAPIDJSON_NOEXCEPT {
+        data_.f.flags = kConstStringFlag;
+        SetStringPointer(s);
+        data_.s.length = s.length;
+    }
+
+    //! Initialize this value as copy string with initial data, without calling destructor.
+    void SetStringRaw(StringRefType s, Allocator& allocator) {
+        Ch* str = 0;
+        if (ShortString::Usable(s.length)) {
+            data_.f.flags = kShortStringFlag;
+            data_.ss.SetLength(s.length);
+            str = data_.ss.str;
+            std::memmove(str, s, s.length * sizeof(Ch));
+        } else {
+            data_.f.flags = kCopyStringFlag;
+            data_.s.length = s.length;
+            str = static_cast<Ch *>(allocator.Malloc((s.length + 1) * sizeof(Ch)));
+            SetStringPointer(str);
+            std::memcpy(str, s, s.length * sizeof(Ch));
+        }
+        str[s.length] = '\0';
+    }
+
+    //! Assignment without calling destructor
+    void RawAssign(GenericValue& rhs) RAPIDJSON_NOEXCEPT {
+        data_ = rhs.data_;
+        // data_.f.flags = rhs.data_.f.flags;
+        rhs.data_.f.flags = kNullFlag;
+    }
+
+    template <typename SourceAllocator>
+    bool StringEqual(const GenericValue<Encoding, SourceAllocator>& rhs) const {
+        RAPIDJSON_ASSERT(IsString());
+        RAPIDJSON_ASSERT(rhs.IsString());
+
+        const SizeType len1 = GetStringLength();
+        const SizeType len2 = rhs.GetStringLength();
+        if(len1 != len2) { return false; }
+
+        const Ch* const str1 = GetString();
+        const Ch* const str2 = rhs.GetString();
+        if(str1 == str2) { return true; } // fast path for constant string
+
+        return (std::memcmp(str1, str2, sizeof(Ch) * len1) == 0);
+    }
+
+    Data data_;
+};
+
+//! GenericValue with UTF8 encoding
+typedef GenericValue<UTF8<> > Value;
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericDocument 
+
+//! A document for parsing JSON text as DOM.
+/*!
+    \note implements Handler concept
+    \tparam Encoding Encoding for both parsing and string storage.
+    \tparam Allocator Allocator for allocating memory for the DOM
+    \tparam StackAllocator Allocator for allocating memory for stack during parsing.
+    \warning Although GenericDocument inherits from GenericValue, the API does \b not provide any virtual functions, especially no virtual destructor.  To avoid memory leaks, do not \c delete a GenericDocument object via a pointer to a GenericValue.
+*/
+template <typename Encoding, typename Allocator = RAPIDJSON_DEFAULT_ALLOCATOR, typename StackAllocator = RAPIDJSON_DEFAULT_STACK_ALLOCATOR >
+class GenericDocument : public GenericValue<Encoding, Allocator> {
+public:
+    typedef typename Encoding::Ch Ch;                       //!< Character type derived from Encoding.
+    typedef GenericValue<Encoding, Allocator> ValueType;    //!< Value type of the document.
+    typedef Allocator AllocatorType;                        //!< Allocator type from template parameter.
+    typedef StackAllocator StackAllocatorType;              //!< StackAllocator type from template parameter.
+
+    //! Constructor
+    /*! Creates an empty document of specified type.
+        \param type             Mandatory type of object to create.
+        \param allocator        Optional allocator for allocating memory.
+        \param stackCapacity    Optional initial capacity of stack in bytes.
+        \param stackAllocator   Optional allocator for allocating memory for stack.
+    */
+    explicit GenericDocument(Type type, Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) :
+        GenericValue<Encoding, Allocator>(type),  allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_()
+    {
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+    }
+
+    //! Constructor
+    /*! Creates an empty document which type is Null. 
+        \param allocator        Optional allocator for allocating memory.
+        \param stackCapacity    Optional initial capacity of stack in bytes.
+        \param stackAllocator   Optional allocator for allocating memory for stack.
+    */
+    GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : 
+        allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_()
+    {
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move constructor in C++11
+    GenericDocument(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT
+        : ValueType(std::forward<ValueType>(rhs)), // explicit cast to avoid prohibited move from Document
+          allocator_(rhs.allocator_),
+          ownAllocator_(rhs.ownAllocator_),
+          stack_(std::move(rhs.stack_)),
+          parseResult_(rhs.parseResult_)
+    {
+        rhs.allocator_ = 0;
+        rhs.ownAllocator_ = 0;
+        rhs.parseResult_ = ParseResult();
+    }
+#endif
+
+    ~GenericDocument() {
+        // Clear the ::ValueType before ownAllocator is destroyed, ~ValueType()
+        // runs last and may access its elements or members which would be freed
+        // with an allocator like MemoryPoolAllocator (CrtAllocator does not
+        // free its data when destroyed, but MemoryPoolAllocator does).
+        if (ownAllocator_) {
+            ValueType::SetNull();
+        }
+        Destroy();
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move assignment in C++11
+    GenericDocument& operator=(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT
+    {
+        // The cast to ValueType is necessary here, because otherwise it would
+        // attempt to call GenericValue's templated assignment operator.
+        ValueType::operator=(std::forward<ValueType>(rhs));
+
+        // Calling the destructor here would prematurely call stack_'s destructor
+        Destroy();
+
+        allocator_ = rhs.allocator_;
+        ownAllocator_ = rhs.ownAllocator_;
+        stack_ = std::move(rhs.stack_);
+        parseResult_ = rhs.parseResult_;
+
+        rhs.allocator_ = 0;
+        rhs.ownAllocator_ = 0;
+        rhs.parseResult_ = ParseResult();
+
+        return *this;
+    }
+#endif
+
+    //! Exchange the contents of this document with those of another.
+    /*!
+        \param rhs Another document.
+        \note Constant complexity.
+        \see GenericValue::Swap
+    */
+    GenericDocument& Swap(GenericDocument& rhs) RAPIDJSON_NOEXCEPT {
+        ValueType::Swap(rhs);
+        stack_.Swap(rhs.stack_);
+        internal::Swap(allocator_, rhs.allocator_);
+        internal::Swap(ownAllocator_, rhs.ownAllocator_);
+        internal::Swap(parseResult_, rhs.parseResult_);
+        return *this;
+    }
+
+    // Allow Swap with ValueType.
+    // Refer to Effective C++ 3rd Edition/Item 33: Avoid hiding inherited names.
+    using ValueType::Swap;
+
+    //! free-standing swap function helper
+    /*!
+        Helper function to enable support for common swap implementation pattern based on \c std::swap:
+        \code
+        void swap(MyClass& a, MyClass& b) {
+            using std::swap;
+            swap(a.doc, b.doc);
+            // ...
+        }
+        \endcode
+        \see Swap()
+     */
+    friend inline void swap(GenericDocument& a, GenericDocument& b) RAPIDJSON_NOEXCEPT { a.Swap(b); }
+
+    //! Populate this document by a generator which produces SAX events.
+    /*! \tparam Generator A functor with <tt>bool f(Handler)</tt> prototype.
+        \param g Generator functor which sends SAX events to the parameter.
+        \return The document itself for fluent API.
+    */
+    template <typename Generator>
+    GenericDocument& Populate(Generator& g) {
+        ClearStackOnExit scope(*this);
+        if (g(*this)) {
+            RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
+            ValueType::operator=(*stack_.template Pop<ValueType>(1));// Move value from stack to document
+        }
+        return *this;
+    }
+
+    //!@name Parse from stream
+    //!@{
+
+    //! Parse JSON text from an input stream (with Encoding conversion)
+    /*! \tparam parseFlags Combination of \ref ParseFlag.
+        \tparam SourceEncoding Encoding of input stream
+        \tparam InputStream Type of input stream, implementing Stream concept
+        \param is Input stream to be parsed.
+        \return The document itself for fluent API.
+    */
+    template <unsigned parseFlags, typename SourceEncoding, typename InputStream>
+    GenericDocument& ParseStream(InputStream& is) {
+        GenericReader<SourceEncoding, Encoding, StackAllocator> reader(
+            stack_.HasAllocator() ? &stack_.GetAllocator() : 0);
+        ClearStackOnExit scope(*this);
+        parseResult_ = reader.template Parse<parseFlags>(is, *this);
+        if (parseResult_) {
+            RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
+            ValueType::operator=(*stack_.template Pop<ValueType>(1));// Move value from stack to document
+        }
+        return *this;
+    }
+
+    //! Parse JSON text from an input stream
+    /*! \tparam parseFlags Combination of \ref ParseFlag.
+        \tparam InputStream Type of input stream, implementing Stream concept
+        \param is Input stream to be parsed.
+        \return The document itself for fluent API.
+    */
+    template <unsigned parseFlags, typename InputStream>
+    GenericDocument& ParseStream(InputStream& is) {
+        return ParseStream<parseFlags, Encoding, InputStream>(is);
+    }
+
+    //! Parse JSON text from an input stream (with \ref kParseDefaultFlags)
+    /*! \tparam InputStream Type of input stream, implementing Stream concept
+        \param is Input stream to be parsed.
+        \return The document itself for fluent API.
+    */
+    template <typename InputStream>
+    GenericDocument& ParseStream(InputStream& is) {
+        return ParseStream<kParseDefaultFlags, Encoding, InputStream>(is);
+    }
+    //!@}
+
+    //!@name Parse in-place from mutable string
+    //!@{
+
+    //! Parse JSON text from a mutable string
+    /*! \tparam parseFlags Combination of \ref ParseFlag.
+        \param str Mutable zero-terminated string to be parsed.
+        \return The document itself for fluent API.
+    */
+    template <unsigned parseFlags>
+    GenericDocument& ParseInsitu(Ch* str) {
+        GenericInsituStringStream<Encoding> s(str);
+        return ParseStream<parseFlags | kParseInsituFlag>(s);
+    }
+
+    //! Parse JSON text from a mutable string (with \ref kParseDefaultFlags)
+    /*! \param str Mutable zero-terminated string to be parsed.
+        \return The document itself for fluent API.
+    */
+    GenericDocument& ParseInsitu(Ch* str) {
+        return ParseInsitu<kParseDefaultFlags>(str);
+    }
+    //!@}
+
+    //!@name Parse from read-only string
+    //!@{
+
+    //! Parse JSON text from a read-only string (with Encoding conversion)
+    /*! \tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag).
+        \tparam SourceEncoding Transcoding from input Encoding
+        \param str Read-only zero-terminated string to be parsed.
+    */
+    template <unsigned parseFlags, typename SourceEncoding>
+    GenericDocument& Parse(const typename SourceEncoding::Ch* str) {
+        RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
+        GenericStringStream<SourceEncoding> s(str);
+        return ParseStream<parseFlags, SourceEncoding>(s);
+    }
+
+    //! Parse JSON text from a read-only string
+    /*! \tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag).
+        \param str Read-only zero-terminated string to be parsed.
+    */
+    template <unsigned parseFlags>
+    GenericDocument& Parse(const Ch* str) {
+        return Parse<parseFlags, Encoding>(str);
+    }
+
+    //! Parse JSON text from a read-only string (with \ref kParseDefaultFlags)
+    /*! \param str Read-only zero-terminated string to be parsed.
+    */
+    GenericDocument& Parse(const Ch* str) {
+        return Parse<kParseDefaultFlags>(str);
+    }
+
+    template <unsigned parseFlags, typename SourceEncoding>
+    GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) {
+        RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
+        MemoryStream ms(reinterpret_cast<const char*>(str), length * sizeof(typename SourceEncoding::Ch));
+        EncodedInputStream<SourceEncoding, MemoryStream> is(ms);
+        ParseStream<parseFlags, SourceEncoding>(is);
+        return *this;
+    }
+
+    template <unsigned parseFlags>
+    GenericDocument& Parse(const Ch* str, size_t length) {
+        return Parse<parseFlags, Encoding>(str, length);
+    }
+    
+    GenericDocument& Parse(const Ch* str, size_t length) {
+        return Parse<kParseDefaultFlags>(str, length);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    template <unsigned parseFlags, typename SourceEncoding>
+    GenericDocument& Parse(const std::basic_string<typename SourceEncoding::Ch>& str) {
+        // c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t)
+        return Parse<parseFlags, SourceEncoding>(str.c_str());
+    }
+
+    template <unsigned parseFlags>
+    GenericDocument& Parse(const std::basic_string<Ch>& str) {
+        return Parse<parseFlags, Encoding>(str.c_str());
+    }
+
+    GenericDocument& Parse(const std::basic_string<Ch>& str) {
+        return Parse<kParseDefaultFlags>(str);
+    }
+#endif // RAPIDJSON_HAS_STDSTRING    
+
+    //!@}
+
+    //!@name Handling parse errors
+    //!@{
+
+    //! Whether a parse error has occurred in the last parsing.
+    bool HasParseError() const { return parseResult_.IsError(); }
+
+    //! Get the \ref ParseErrorCode of last parsing.
+    ParseErrorCode GetParseError() const { return parseResult_.Code(); }
+
+    //! Get the position of last parsing error in input, 0 otherwise.
+    size_t GetErrorOffset() const { return parseResult_.Offset(); }
+
+    //! Implicit conversion to get the last parse result
+#ifndef __clang // -Wdocumentation
+    /*! \return \ref ParseResult of the last parse operation
+
+        \code
+          Document doc;
+          ParseResult ok = doc.Parse(json);
+          if (!ok)
+            printf( "JSON parse error: %s (%u)\n", GetParseError_En(ok.Code()), ok.Offset());
+        \endcode
+     */
+#endif
+    operator ParseResult() const { return parseResult_; }
+    //!@}
+
+    //! Get the allocator of this document.
+    Allocator& GetAllocator() {
+        RAPIDJSON_ASSERT(allocator_);
+        return *allocator_;
+    }
+
+    //! Get the capacity of stack in bytes.
+    size_t GetStackCapacity() const { return stack_.GetCapacity(); }
+
+private:
+    // clear stack on any exit from ParseStream, e.g. due to exception
+    struct ClearStackOnExit {
+        explicit ClearStackOnExit(GenericDocument& d) : d_(d) {}
+        ~ClearStackOnExit() { d_.ClearStack(); }
+    private:
+        ClearStackOnExit(const ClearStackOnExit&);
+        ClearStackOnExit& operator=(const ClearStackOnExit&);
+        GenericDocument& d_;
+    };
+
+    // callers of the following private Handler functions
+    // template <typename,typename,typename> friend class GenericReader; // for parsing
+    template <typename, typename> friend class GenericValue; // for deep copying
+
+public:
+    // Implementation of Handler
+    bool Null() { new (stack_.template Push<ValueType>()) ValueType(); return true; }
+    bool Bool(bool b) { new (stack_.template Push<ValueType>()) ValueType(b); return true; }
+    bool Int(int i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; }
+    bool Uint(unsigned i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; }
+    bool Int64(int64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; }
+    bool Uint64(uint64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; }
+    bool Double(double d) { new (stack_.template Push<ValueType>()) ValueType(d); return true; }
+
+    bool RawNumber(const Ch* str, SizeType length, bool copy) { 
+        if (copy) 
+            new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator());
+        else
+            new (stack_.template Push<ValueType>()) ValueType(str, length);
+        return true;
+    }
+
+    bool String(const Ch* str, SizeType length, bool copy) { 
+        if (copy) 
+            new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator());
+        else
+            new (stack_.template Push<ValueType>()) ValueType(str, length);
+        return true;
+    }
+
+    bool StartObject() { new (stack_.template Push<ValueType>()) ValueType(kObjectType); return true; }
+    
+    bool Key(const Ch* str, SizeType length, bool copy) { return String(str, length, copy); }
+
+    bool EndObject(SizeType memberCount) {
+        typename ValueType::Member* members = stack_.template Pop<typename ValueType::Member>(memberCount);
+        stack_.template Top<ValueType>()->SetObjectRaw(members, memberCount, GetAllocator());
+        return true;
+    }
+
+    bool StartArray() { new (stack_.template Push<ValueType>()) ValueType(kArrayType); return true; }
+    
+    bool EndArray(SizeType elementCount) {
+        ValueType* elements = stack_.template Pop<ValueType>(elementCount);
+        stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
+        return true;
+    }
+
+private:
+    //! Prohibit copying
+    GenericDocument(const GenericDocument&);
+    //! Prohibit assignment
+    GenericDocument& operator=(const GenericDocument&);
+
+    void ClearStack() {
+        if (Allocator::kNeedFree)
+            while (stack_.GetSize() > 0)    // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects)
+                (stack_.template Pop<ValueType>(1))->~ValueType();
+        else
+            stack_.Clear();
+        stack_.ShrinkToFit();
+    }
+
+    void Destroy() {
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    static const size_t kDefaultStackCapacity = 1024;
+    Allocator* allocator_;
+    Allocator* ownAllocator_;
+    internal::Stack<StackAllocator> stack_;
+    ParseResult parseResult_;
+};
+
+//! GenericDocument with UTF8 encoding
+typedef GenericDocument<UTF8<> > Document;
+
+
+//! Helper class for accessing Value of array type.
+/*!
+    Instance of this helper class is obtained by \c GenericValue::GetArray().
+    In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1.
+*/
+template <bool Const, typename ValueT>
+class GenericArray {
+public:
+    typedef GenericArray<true, ValueT> ConstArray;
+    typedef GenericArray<false, ValueT> Array;
+    typedef ValueT PlainType;
+    typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType;
+    typedef ValueType* ValueIterator;  // This may be const or non-const iterator
+    typedef const ValueT* ConstValueIterator;
+    typedef typename ValueType::AllocatorType AllocatorType;
+    typedef typename ValueType::StringRefType StringRefType;
+
+    template <typename, typename>
+    friend class GenericValue;
+
+    GenericArray(const GenericArray& rhs) : value_(rhs.value_) {}
+    GenericArray& operator=(const GenericArray& rhs) { value_ = rhs.value_; return *this; }
+    ~GenericArray() {}
+
+    operator ValueType&() const { return value_; }
+    SizeType Size() const { return value_.Size(); }
+    SizeType Capacity() const { return value_.Capacity(); }
+    bool Empty() const { return value_.Empty(); }
+    void Clear() const { value_.Clear(); }
+    ValueType& operator[](SizeType index) const {  return value_[index]; }
+    ValueIterator Begin() const { return value_.Begin(); }
+    ValueIterator End() const { return value_.End(); }
+    GenericArray Reserve(SizeType newCapacity, AllocatorType &allocator) const { value_.Reserve(newCapacity, allocator); return *this; }
+    GenericArray PushBack(ValueType& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; }
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericArray PushBack(ValueType&& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; }
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericArray PushBack(StringRefType value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; }
+    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (const GenericArray&)) PushBack(T value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; }
+    GenericArray PopBack() const { value_.PopBack(); return *this; }
+    ValueIterator Erase(ConstValueIterator pos) const { return value_.Erase(pos); }
+    ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) const { return value_.Erase(first, last); }
+
+#if RAPIDJSON_HAS_CXX11_RANGE_FOR
+    ValueIterator begin() const { return value_.Begin(); }
+    ValueIterator end() const { return value_.End(); }
+#endif
+
+private:
+    GenericArray();
+    GenericArray(ValueType& value) : value_(value) {}
+    ValueType& value_;
+};
+
+//! Helper class for accessing Value of object type.
+/*!
+    Instance of this helper class is obtained by \c GenericValue::GetObject().
+    In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1.
+*/
+template <bool Const, typename ValueT>
+class GenericObject {
+public:
+    typedef GenericObject<true, ValueT> ConstObject;
+    typedef GenericObject<false, ValueT> Object;
+    typedef ValueT PlainType;
+    typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType;
+    typedef GenericMemberIterator<Const, typename ValueT::EncodingType, typename ValueT::AllocatorType> MemberIterator;  // This may be const or non-const iterator
+    typedef GenericMemberIterator<true, typename ValueT::EncodingType, typename ValueT::AllocatorType> ConstMemberIterator;
+    typedef typename ValueType::AllocatorType AllocatorType;
+    typedef typename ValueType::StringRefType StringRefType;
+    typedef typename ValueType::EncodingType EncodingType;
+    typedef typename ValueType::Ch Ch;
+
+    template <typename, typename>
+    friend class GenericValue;
+
+    GenericObject(const GenericObject& rhs) : value_(rhs.value_) {}
+    GenericObject& operator=(const GenericObject& rhs) { value_ = rhs.value_; return *this; }
+    ~GenericObject() {}
+
+    operator ValueType&() const { return value_; }
+    SizeType MemberCount() const { return value_.MemberCount(); }
+    SizeType MemberCapacity() const { return value_.MemberCapacity(); }
+    bool ObjectEmpty() const { return value_.ObjectEmpty(); }
+    template <typename T> ValueType& operator[](T* name) const { return value_[name]; }
+    template <typename SourceAllocator> ValueType& operator[](const GenericValue<EncodingType, SourceAllocator>& name) const { return value_[name]; }
+#if RAPIDJSON_HAS_STDSTRING
+    ValueType& operator[](const std::basic_string<Ch>& name) const { return value_[name]; }
+#endif
+    MemberIterator MemberBegin() const { return value_.MemberBegin(); }
+    MemberIterator MemberEnd() const { return value_.MemberEnd(); }
+    GenericObject MemberReserve(SizeType newCapacity, AllocatorType &allocator) const { value_.MemberReserve(newCapacity, allocator); return *this; }
+    bool HasMember(const Ch* name) const { return value_.HasMember(name); }
+#if RAPIDJSON_HAS_STDSTRING
+    bool HasMember(const std::basic_string<Ch>& name) const { return value_.HasMember(name); }
+#endif
+    template <typename SourceAllocator> bool HasMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.HasMember(name); }
+    MemberIterator FindMember(const Ch* name) const { return value_.FindMember(name); }
+    template <typename SourceAllocator> MemberIterator FindMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.FindMember(name); }
+#if RAPIDJSON_HAS_STDSTRING
+    MemberIterator FindMember(const std::basic_string<Ch>& name) const { return value_.FindMember(name); }
+#endif
+    GenericObject AddMember(ValueType& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    GenericObject AddMember(ValueType& name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+#if RAPIDJSON_HAS_STDSTRING
+    GenericObject AddMember(ValueType& name, std::basic_string<Ch>& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+#endif
+    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&)) AddMember(ValueType& name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericObject AddMember(ValueType&& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    GenericObject AddMember(ValueType&& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    GenericObject AddMember(ValueType& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    GenericObject AddMember(StringRefType name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericObject AddMember(StringRefType name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    GenericObject AddMember(StringRefType name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericObject)) AddMember(StringRefType name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; }
+    void RemoveAllMembers() { value_.RemoveAllMembers(); }
+    bool RemoveMember(const Ch* name) const { return value_.RemoveMember(name); }
+#if RAPIDJSON_HAS_STDSTRING
+    bool RemoveMember(const std::basic_string<Ch>& name) const { return value_.RemoveMember(name); }
+#endif
+    template <typename SourceAllocator> bool RemoveMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.RemoveMember(name); }
+    MemberIterator RemoveMember(MemberIterator m) const { return value_.RemoveMember(m); }
+    MemberIterator EraseMember(ConstMemberIterator pos) const { return value_.EraseMember(pos); }
+    MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) const { return value_.EraseMember(first, last); }
+    bool EraseMember(const Ch* name) const { return value_.EraseMember(name); }
+#if RAPIDJSON_HAS_STDSTRING
+    bool EraseMember(const std::basic_string<Ch>& name) const { return EraseMember(ValueType(StringRef(name))); }
+#endif
+    template <typename SourceAllocator> bool EraseMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.EraseMember(name); }
+
+#if RAPIDJSON_HAS_CXX11_RANGE_FOR
+    MemberIterator begin() const { return value_.MemberBegin(); }
+    MemberIterator end() const { return value_.MemberEnd(); }
+#endif
+
+private:
+    GenericObject();
+    GenericObject(ValueType& value) : value_(value) {}
+    ValueType& value_;
+};
+
+RAPIDJSON_NAMESPACE_END
+RAPIDJSON_DIAG_POP
+
+#ifdef RAPIDJSON_WINDOWS_GETOBJECT_WORKAROUND_APPLIED
+#pragma pop_macro("GetObject")
+#undef RAPIDJSON_WINDOWS_GETOBJECT_WORKAROUND_APPLIED
+#endif
+
+#endif // RAPIDJSON_DOCUMENT_H_
diff --git a/include/rapidjson/encodedstream.h b/include/rapidjson/encodedstream.h
new file mode 100644
index 0000000000..cf046b8923
--- /dev/null
+++ b/include/rapidjson/encodedstream.h
@@ -0,0 +1,299 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ENCODEDSTREAM_H_
+#define RAPIDJSON_ENCODEDSTREAM_H_
+
+#include "stream.h"
+#include "memorystream.h"
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Input byte stream wrapper with a statically bound encoding.
+/*!
+    \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
+    \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
+*/
+template <typename Encoding, typename InputByteStream>
+class EncodedInputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+public:
+    typedef typename Encoding::Ch Ch;
+
+    EncodedInputStream(InputByteStream& is) : is_(is) { 
+        current_ = Encoding::TakeBOM(is_);
+    }
+
+    Ch Peek() const { return current_; }
+    Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
+    size_t Tell() const { return is_.Tell(); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); } 
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    EncodedInputStream(const EncodedInputStream&);
+    EncodedInputStream& operator=(const EncodedInputStream&);
+
+    InputByteStream& is_;
+    Ch current_;
+};
+
+//! Specialized for UTF8 MemoryStream.
+template <>
+class EncodedInputStream<UTF8<>, MemoryStream> {
+public:
+    typedef UTF8<>::Ch Ch;
+
+    EncodedInputStream(MemoryStream& is) : is_(is) {
+        if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
+        if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
+        if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
+    }
+    Ch Peek() const { return is_.Peek(); }
+    Ch Take() { return is_.Take(); }
+    size_t Tell() const { return is_.Tell(); }
+
+    // Not implemented
+    void Put(Ch) {}
+    void Flush() {} 
+    Ch* PutBegin() { return 0; }
+    size_t PutEnd(Ch*) { return 0; }
+
+    MemoryStream& is_;
+
+private:
+    EncodedInputStream(const EncodedInputStream&);
+    EncodedInputStream& operator=(const EncodedInputStream&);
+};
+
+//! Output byte stream wrapper with statically bound encoding.
+/*!
+    \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
+    \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream.
+*/
+template <typename Encoding, typename OutputByteStream>
+class EncodedOutputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+public:
+    typedef typename Encoding::Ch Ch;
+
+    EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { 
+        if (putBOM)
+            Encoding::PutBOM(os_);
+    }
+
+    void Put(Ch c) { Encoding::Put(os_, c);  }
+    void Flush() { os_.Flush(); }
+
+    // Not implemented
+    Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
+    Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
+    size_t Tell() const { RAPIDJSON_ASSERT(false);  return 0; }
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    EncodedOutputStream(const EncodedOutputStream&);
+    EncodedOutputStream& operator=(const EncodedOutputStream&);
+
+    OutputByteStream& os_;
+};
+
+#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
+
+//! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
+/*!
+    \tparam CharType Type of character for reading.
+    \tparam InputByteStream type of input byte stream to be wrapped.
+*/
+template <typename CharType, typename InputByteStream>
+class AutoUTFInputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+public:
+    typedef CharType Ch;
+
+    //! Constructor.
+    /*!
+        \param is input stream to be wrapped.
+        \param type UTF encoding type if it is not detected from the stream.
+    */
+    AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
+        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);        
+        DetectType();
+        static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
+        takeFunc_ = f[type_];
+        current_ = takeFunc_(*is_);
+    }
+
+    UTFType GetType() const { return type_; }
+    bool HasBOM() const { return hasBOM_; }
+
+    Ch Peek() const { return current_; }
+    Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
+    size_t Tell() const { return is_->Tell(); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); } 
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    AutoUTFInputStream(const AutoUTFInputStream&);
+    AutoUTFInputStream& operator=(const AutoUTFInputStream&);
+
+    // Detect encoding type with BOM or RFC 4627
+    void DetectType() {
+        // BOM (Byte Order Mark):
+        // 00 00 FE FF  UTF-32BE
+        // FF FE 00 00  UTF-32LE
+        // FE FF        UTF-16BE
+        // FF FE        UTF-16LE
+        // EF BB BF     UTF-8
+
+        const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4());
+        if (!c)
+            return;
+
+        unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
+        hasBOM_ = false;
+        if (bom == 0xFFFE0000)                  { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
+        else if (bom == 0x0000FEFF)             { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
+        else if ((bom & 0xFFFF) == 0xFFFE)      { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take();                           }
+        else if ((bom & 0xFFFF) == 0xFEFF)      { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take();                           }
+        else if ((bom & 0xFFFFFF) == 0xBFBBEF)  { type_ = kUTF8;    hasBOM_ = true; is_->Take(); is_->Take(); is_->Take();              }
+
+        // RFC 4627: Section 3
+        // "Since the first two characters of a JSON text will always be ASCII
+        // characters [RFC0020], it is possible to determine whether an octet
+        // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
+        // at the pattern of nulls in the first four octets."
+        // 00 00 00 xx  UTF-32BE
+        // 00 xx 00 xx  UTF-16BE
+        // xx 00 00 00  UTF-32LE
+        // xx 00 xx 00  UTF-16LE
+        // xx xx xx xx  UTF-8
+
+        if (!hasBOM_) {
+            int pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
+            switch (pattern) {
+            case 0x08: type_ = kUTF32BE; break;
+            case 0x0A: type_ = kUTF16BE; break;
+            case 0x01: type_ = kUTF32LE; break;
+            case 0x05: type_ = kUTF16LE; break;
+            case 0x0F: type_ = kUTF8;    break;
+            default: break; // Use type defined by user.
+            }
+        }
+
+        // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
+        if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
+        if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
+    }
+
+    typedef Ch (*TakeFunc)(InputByteStream& is);
+    InputByteStream* is_;
+    UTFType type_;
+    Ch current_;
+    TakeFunc takeFunc_;
+    bool hasBOM_;
+};
+
+//! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
+/*!
+    \tparam CharType Type of character for writing.
+    \tparam OutputByteStream type of output byte stream to be wrapped.
+*/
+template <typename CharType, typename OutputByteStream>
+class AutoUTFOutputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+public:
+    typedef CharType Ch;
+
+    //! Constructor.
+    /*!
+        \param os output stream to be wrapped.
+        \param type UTF encoding type.
+        \param putBOM Whether to write BOM at the beginning of the stream.
+    */
+    AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
+        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
+
+        // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
+        if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
+        if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
+
+        static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
+        putFunc_ = f[type_];
+
+        if (putBOM)
+            PutBOM();
+    }
+
+    UTFType GetType() const { return type_; }
+
+    void Put(Ch c) { putFunc_(*os_, c); }
+    void Flush() { os_->Flush(); } 
+
+    // Not implemented
+    Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
+    Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    AutoUTFOutputStream(const AutoUTFOutputStream&);
+    AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
+
+    void PutBOM() { 
+        typedef void (*PutBOMFunc)(OutputByteStream&);
+        static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
+        f[type_](*os_);
+    }
+
+    typedef void (*PutFunc)(OutputByteStream&, Ch);
+
+    OutputByteStream* os_;
+    UTFType type_;
+    PutFunc putFunc_;
+};
+
+#undef RAPIDJSON_ENCODINGS_FUNC
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_FILESTREAM_H_
diff --git a/include/rapidjson/encodings.h b/include/rapidjson/encodings.h
new file mode 100644
index 0000000000..c453c0da31
--- /dev/null
+++ b/include/rapidjson/encodings.h
@@ -0,0 +1,716 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ENCODINGS_H_
+#define RAPIDJSON_ENCODINGS_H_
+
+#include "rapidjson.h"
+
+#if defined(_MSC_VER) && !defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4244) // conversion from 'type1' to 'type2', possible loss of data
+RAPIDJSON_DIAG_OFF(4702)  // unreachable code
+#elif defined(__GNUC__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+RAPIDJSON_DIAG_OFF(overflow)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// Encoding
+
+/*! \class rapidjson::Encoding
+    \brief Concept for encoding of Unicode characters.
+
+\code
+concept Encoding {
+    typename Ch;    //! Type of character. A "character" is actually a code unit in unicode's definition.
+
+    enum { supportUnicode = 1 }; // or 0 if not supporting unicode
+
+    //! \brief Encode a Unicode codepoint to an output stream.
+    //! \param os Output stream.
+    //! \param codepoint An unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively.
+    template<typename OutputStream>
+    static void Encode(OutputStream& os, unsigned codepoint);
+
+    //! \brief Decode a Unicode codepoint from an input stream.
+    //! \param is Input stream.
+    //! \param codepoint Output of the unicode codepoint.
+    //! \return true if a valid codepoint can be decoded from the stream.
+    template <typename InputStream>
+    static bool Decode(InputStream& is, unsigned* codepoint);
+
+    //! \brief Validate one Unicode codepoint from an encoded stream.
+    //! \param is Input stream to obtain codepoint.
+    //! \param os Output for copying one codepoint.
+    //! \return true if it is valid.
+    //! \note This function just validating and copying the codepoint without actually decode it.
+    template <typename InputStream, typename OutputStream>
+    static bool Validate(InputStream& is, OutputStream& os);
+
+    // The following functions are deal with byte streams.
+
+    //! Take a character from input byte stream, skip BOM if exist.
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is);
+
+    //! Take a character from input byte stream.
+    template <typename InputByteStream>
+    static Ch Take(InputByteStream& is);
+
+    //! Put BOM to output byte stream.
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os);
+
+    //! Put a character to output byte stream.
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, Ch c);
+};
+\endcode
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+// UTF8
+
+//! UTF-8 encoding.
+/*! http://en.wikipedia.org/wiki/UTF-8
+    http://tools.ietf.org/html/rfc3629
+    \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char.
+    \note implements Encoding concept
+*/
+template<typename CharType = char>
+struct UTF8 {
+    typedef CharType Ch;
+
+    enum { supportUnicode = 1 };
+
+    template<typename OutputStream>
+    static void Encode(OutputStream& os, unsigned codepoint) {
+        if (codepoint <= 0x7F) 
+            os.Put(static_cast<Ch>(codepoint & 0xFF));
+        else if (codepoint <= 0x7FF) {
+            os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
+            os.Put(static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
+        }
+        else if (codepoint <= 0xFFFF) {
+            os.Put(static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
+            os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
+            os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
+        }
+        else {
+            RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+            os.Put(static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
+            os.Put(static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
+            os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
+            os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
+        }
+    }
+
+    template<typename OutputStream>
+    static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        if (codepoint <= 0x7F) 
+            PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
+        else if (codepoint <= 0x7FF) {
+            PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
+        }
+        else if (codepoint <= 0xFFFF) {
+            PutUnsafe(os, static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
+        }
+        else {
+            RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+            PutUnsafe(os, static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
+            PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
+        }
+    }
+
+    template <typename InputStream>
+    static bool Decode(InputStream& is, unsigned* codepoint) {
+#define RAPIDJSON_COPY() c = is.Take(); *codepoint = (*codepoint << 6) | (static_cast<unsigned char>(c) & 0x3Fu)
+#define RAPIDJSON_TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
+#define RAPIDJSON_TAIL() RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x70)
+        typename InputStream::Ch c = is.Take();
+        if (!(c & 0x80)) {
+            *codepoint = static_cast<unsigned char>(c);
+            return true;
+        }
+
+        unsigned char type = GetRange(static_cast<unsigned char>(c));
+        if (type >= 32) {
+            *codepoint = 0;
+        } else {
+            *codepoint = (0xFFu >> type) & static_cast<unsigned char>(c);
+        }
+        bool result = true;
+        switch (type) {
+        case 2: RAPIDJSON_TAIL(); return result;
+        case 3: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 4: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x50); RAPIDJSON_TAIL(); return result;
+        case 5: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x10); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 6: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 10: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x20); RAPIDJSON_TAIL(); return result;
+        case 11: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x60); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        default: return false;
+        }
+#undef RAPIDJSON_COPY
+#undef RAPIDJSON_TRANS
+#undef RAPIDJSON_TAIL
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static bool Validate(InputStream& is, OutputStream& os) {
+#define RAPIDJSON_COPY() if (c != '\0') os.Put(c = is.Take())
+#define RAPIDJSON_TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
+#define RAPIDJSON_TAIL() RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x70)
+        Ch c = static_cast<Ch>(-1);
+        RAPIDJSON_COPY();
+        if (!(c & 0x80))
+            return true;
+
+        bool result = true;
+        switch (GetRange(static_cast<unsigned char>(c))) {
+        case 2: RAPIDJSON_TAIL(); return result;
+        case 3: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 4: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x50); RAPIDJSON_TAIL(); return result;
+        case 5: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x10); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 6: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        case 10: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x20); RAPIDJSON_TAIL(); return result;
+        case 11: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x60); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result;
+        default: return false;
+        }
+#undef RAPIDJSON_COPY
+#undef RAPIDJSON_TRANS
+#undef RAPIDJSON_TAIL
+    }
+
+    static unsigned char GetRange(unsigned char c) {
+        // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+        // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
+        static const unsigned char type[] = {
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
+            0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
+            0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+            0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+            8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+            10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+        };
+        return type[c];
+    }
+
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        typename InputByteStream::Ch c = Take(is);
+        if (static_cast<unsigned char>(c) != 0xEFu) return c;
+        c = is.Take();
+        if (static_cast<unsigned char>(c) != 0xBBu) return c;
+        c = is.Take();
+        if (static_cast<unsigned char>(c) != 0xBFu) return c;
+        c = is.Take();
+        return c;
+    }
+
+    template <typename InputByteStream>
+    static Ch Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        return static_cast<Ch>(is.Take());
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xEFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xBBu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xBFu));
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, Ch c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(c));
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// UTF16
+
+//! UTF-16 encoding.
+/*! http://en.wikipedia.org/wiki/UTF-16
+    http://tools.ietf.org/html/rfc2781
+    \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead.
+    \note implements Encoding concept
+
+    \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
+    For streaming, use UTF16LE and UTF16BE, which handle endianness.
+*/
+template<typename CharType = wchar_t>
+struct UTF16 {
+    typedef CharType Ch;
+    RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2);
+
+    enum { supportUnicode = 1 };
+
+    template<typename OutputStream>
+    static void Encode(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
+        if (codepoint <= 0xFFFF) {
+            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 
+            os.Put(static_cast<typename OutputStream::Ch>(codepoint));
+        }
+        else {
+            RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+            unsigned v = codepoint - 0x10000;
+            os.Put(static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
+            os.Put(static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00));
+        }
+    }
+
+
+    template<typename OutputStream>
+    static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
+        if (codepoint <= 0xFFFF) {
+            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 
+            PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint));
+        }
+        else {
+            RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+            unsigned v = codepoint - 0x10000;
+            PutUnsafe(os, static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
+            PutUnsafe(os, static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00));
+        }
+    }
+
+    template <typename InputStream>
+    static bool Decode(InputStream& is, unsigned* codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
+        typename InputStream::Ch c = is.Take();
+        if (c < 0xD800 || c > 0xDFFF) {
+            *codepoint = static_cast<unsigned>(c);
+            return true;
+        }
+        else if (c <= 0xDBFF) {
+            *codepoint = (static_cast<unsigned>(c) & 0x3FF) << 10;
+            c = is.Take();
+            *codepoint |= (static_cast<unsigned>(c) & 0x3FF);
+            *codepoint += 0x10000;
+            return c >= 0xDC00 && c <= 0xDFFF;
+        }
+        return false;
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static bool Validate(InputStream& is, OutputStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
+        typename InputStream::Ch c;
+        os.Put(static_cast<typename OutputStream::Ch>(c = is.Take()));
+        if (c < 0xD800 || c > 0xDFFF)
+            return true;
+        else if (c <= 0xDBFF) {
+            os.Put(c = is.Take());
+            return c >= 0xDC00 && c <= 0xDFFF;
+        }
+        return false;
+    }
+};
+
+//! UTF-16 little endian encoding.
+template<typename CharType = wchar_t>
+struct UTF16LE : UTF16<CharType> {
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        CharType c = Take(is);
+        return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
+    }
+
+    template <typename InputByteStream>
+    static CharType Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        unsigned c = static_cast<uint8_t>(is.Take());
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
+        return static_cast<CharType>(c);
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, CharType c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
+    }
+};
+
+//! UTF-16 big endian encoding.
+template<typename CharType = wchar_t>
+struct UTF16BE : UTF16<CharType> {
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        CharType c = Take(is);
+        return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
+    }
+
+    template <typename InputByteStream>
+    static CharType Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
+        return static_cast<CharType>(c);
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, CharType c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// UTF32
+
+//! UTF-32 encoding. 
+/*! http://en.wikipedia.org/wiki/UTF-32
+    \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead.
+    \note implements Encoding concept
+
+    \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
+    For streaming, use UTF32LE and UTF32BE, which handle endianness.
+*/
+template<typename CharType = unsigned>
+struct UTF32 {
+    typedef CharType Ch;
+    RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4);
+
+    enum { supportUnicode = 1 };
+
+    template<typename OutputStream>
+    static void Encode(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
+        RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+        os.Put(codepoint);
+    }
+
+    template<typename OutputStream>
+    static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
+        RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
+        PutUnsafe(os, codepoint);
+    }
+
+    template <typename InputStream>
+    static bool Decode(InputStream& is, unsigned* codepoint) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
+        Ch c = is.Take();
+        *codepoint = c;
+        return c <= 0x10FFFF;
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static bool Validate(InputStream& is, OutputStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
+        Ch c;
+        os.Put(c = is.Take());
+        return c <= 0x10FFFF;
+    }
+};
+
+//! UTF-32 little endian enocoding.
+template<typename CharType = unsigned>
+struct UTF32LE : UTF32<CharType> {
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        CharType c = Take(is);
+        return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
+    }
+
+    template <typename InputByteStream>
+    static CharType Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        unsigned c = static_cast<uint8_t>(is.Take());
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
+        return static_cast<CharType>(c);
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, CharType c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
+    }
+};
+
+//! UTF-32 big endian encoding.
+template<typename CharType = unsigned>
+struct UTF32BE : UTF32<CharType> {
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        CharType c = Take(is);
+        return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c; 
+    }
+
+    template <typename InputByteStream>
+    static CharType Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
+        c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
+        return static_cast<CharType>(c);
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, CharType c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
+        os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// ASCII
+
+//! ASCII encoding.
+/*! http://en.wikipedia.org/wiki/ASCII
+    \tparam CharType Code unit for storing 7-bit ASCII data. Default is char.
+    \note implements Encoding concept
+*/
+template<typename CharType = char>
+struct ASCII {
+    typedef CharType Ch;
+
+    enum { supportUnicode = 0 };
+
+    template<typename OutputStream>
+    static void Encode(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_ASSERT(codepoint <= 0x7F);
+        os.Put(static_cast<Ch>(codepoint & 0xFF));
+    }
+
+    template<typename OutputStream>
+    static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        RAPIDJSON_ASSERT(codepoint <= 0x7F);
+        PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
+    }
+
+    template <typename InputStream>
+    static bool Decode(InputStream& is, unsigned* codepoint) {
+        uint8_t c = static_cast<uint8_t>(is.Take());
+        *codepoint = c;
+        return c <= 0X7F;
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static bool Validate(InputStream& is, OutputStream& os) {
+        uint8_t c = static_cast<uint8_t>(is.Take());
+        os.Put(static_cast<typename OutputStream::Ch>(c));
+        return c <= 0x7F;
+    }
+
+    template <typename InputByteStream>
+    static CharType TakeBOM(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        uint8_t c = static_cast<uint8_t>(Take(is));
+        return static_cast<Ch>(c);
+    }
+
+    template <typename InputByteStream>
+    static Ch Take(InputByteStream& is) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+        return static_cast<Ch>(is.Take());
+    }
+
+    template <typename OutputByteStream>
+    static void PutBOM(OutputByteStream& os) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        (void)os;
+    }
+
+    template <typename OutputByteStream>
+    static void Put(OutputByteStream& os, Ch c) {
+        RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+        os.Put(static_cast<typename OutputByteStream::Ch>(c));
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// AutoUTF
+
+//! Runtime-specified UTF encoding type of a stream.
+enum UTFType {
+    kUTF8 = 0,      //!< UTF-8.
+    kUTF16LE = 1,   //!< UTF-16 little endian.
+    kUTF16BE = 2,   //!< UTF-16 big endian.
+    kUTF32LE = 3,   //!< UTF-32 little endian.
+    kUTF32BE = 4    //!< UTF-32 big endian.
+};
+
+//! Dynamically select encoding according to stream's runtime-specified UTF encoding type.
+/*! \note This class can be used with AutoUTFInputtStream and AutoUTFOutputStream, which provides GetType().
+*/
+template<typename CharType>
+struct AutoUTF {
+    typedef CharType Ch;
+
+    enum { supportUnicode = 1 };
+
+#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
+
+    template<typename OutputStream>
+    static RAPIDJSON_FORCEINLINE void Encode(OutputStream& os, unsigned codepoint) {
+        typedef void (*EncodeFunc)(OutputStream&, unsigned);
+        static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Encode) };
+        (*f[os.GetType()])(os, codepoint);
+    }
+
+    template<typename OutputStream>
+    static RAPIDJSON_FORCEINLINE void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        typedef void (*EncodeFunc)(OutputStream&, unsigned);
+        static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe) };
+        (*f[os.GetType()])(os, codepoint);
+    }
+
+    template <typename InputStream>
+    static RAPIDJSON_FORCEINLINE bool Decode(InputStream& is, unsigned* codepoint) {
+        typedef bool (*DecodeFunc)(InputStream&, unsigned*);
+        static const DecodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Decode) };
+        return (*f[is.GetType()])(is, codepoint);
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        typedef bool (*ValidateFunc)(InputStream&, OutputStream&);
+        static const ValidateFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Validate) };
+        return (*f[is.GetType()])(is, os);
+    }
+
+#undef RAPIDJSON_ENCODINGS_FUNC
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Transcoder
+
+//! Encoding conversion.
+template<typename SourceEncoding, typename TargetEncoding>
+struct Transcoder {
+    //! Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the output stream.
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Transcode(InputStream& is, OutputStream& os) {
+        unsigned codepoint;
+        if (!SourceEncoding::Decode(is, &codepoint))
+            return false;
+        TargetEncoding::Encode(os, codepoint);
+        return true;
+    }
+
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
+        unsigned codepoint;
+        if (!SourceEncoding::Decode(is, &codepoint))
+            return false;
+        TargetEncoding::EncodeUnsafe(os, codepoint);
+        return true;
+    }
+
+    //! Validate one Unicode codepoint from an encoded stream.
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        return Transcode(is, os);   // Since source/target encoding is different, must transcode.
+    }
+};
+
+// Forward declaration.
+template<typename Stream>
+inline void PutUnsafe(Stream& stream, typename Stream::Ch c);
+
+//! Specialization of Transcoder with same source and target encoding.
+template<typename Encoding>
+struct Transcoder<Encoding, Encoding> {
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Transcode(InputStream& is, OutputStream& os) {
+        os.Put(is.Take());  // Just copy one code unit. This semantic is different from primary template class.
+        return true;
+    }
+    
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
+        PutUnsafe(os, is.Take());  // Just copy one code unit. This semantic is different from primary template class.
+        return true;
+    }
+    
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        return Encoding::Validate(is, os);  // source/target encoding are the same
+    }
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__GNUC__) || (defined(_MSC_VER) && !defined(__clang__))
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ENCODINGS_H_
diff --git a/include/rapidjson/error/en.h b/include/rapidjson/error/en.h
new file mode 100644
index 0000000000..c87b04eb13
--- /dev/null
+++ b/include/rapidjson/error/en.h
@@ -0,0 +1,176 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ERROR_EN_H_
+#define RAPIDJSON_ERROR_EN_H_
+
+#include "error.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(switch-enum)
+RAPIDJSON_DIAG_OFF(covered-switch-default)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Maps error code of parsing into error message.
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \param parseErrorCode Error code obtained in parsing.
+    \return the error message.
+    \note User can make a copy of this function for localization.
+        Using switch-case is safer for future modification of error codes.
+*/
+inline const RAPIDJSON_ERROR_CHARTYPE* GetParseError_En(ParseErrorCode parseErrorCode) {
+    switch (parseErrorCode) {
+        case kParseErrorNone:                           return RAPIDJSON_ERROR_STRING("No error.");
+
+        case kParseErrorDocumentEmpty:                  return RAPIDJSON_ERROR_STRING("The document is empty.");
+        case kParseErrorDocumentRootNotSingular:        return RAPIDJSON_ERROR_STRING("The document root must not be followed by other values.");
+
+        case kParseErrorValueInvalid:                   return RAPIDJSON_ERROR_STRING("Invalid value.");
+
+        case kParseErrorObjectMissName:                 return RAPIDJSON_ERROR_STRING("Missing a name for object member.");
+        case kParseErrorObjectMissColon:                return RAPIDJSON_ERROR_STRING("Missing a colon after a name of object member.");
+        case kParseErrorObjectMissCommaOrCurlyBracket:  return RAPIDJSON_ERROR_STRING("Missing a comma or '}' after an object member.");
+
+        case kParseErrorArrayMissCommaOrSquareBracket:  return RAPIDJSON_ERROR_STRING("Missing a comma or ']' after an array element.");
+
+        case kParseErrorStringUnicodeEscapeInvalidHex:  return RAPIDJSON_ERROR_STRING("Incorrect hex digit after \\u escape in string.");
+        case kParseErrorStringUnicodeSurrogateInvalid:  return RAPIDJSON_ERROR_STRING("The surrogate pair in string is invalid.");
+        case kParseErrorStringEscapeInvalid:            return RAPIDJSON_ERROR_STRING("Invalid escape character in string.");
+        case kParseErrorStringMissQuotationMark:        return RAPIDJSON_ERROR_STRING("Missing a closing quotation mark in string.");
+        case kParseErrorStringInvalidEncoding:          return RAPIDJSON_ERROR_STRING("Invalid encoding in string.");
+
+        case kParseErrorNumberTooBig:                   return RAPIDJSON_ERROR_STRING("Number too big to be stored in double.");
+        case kParseErrorNumberMissFraction:             return RAPIDJSON_ERROR_STRING("Miss fraction part in number.");
+        case kParseErrorNumberMissExponent:             return RAPIDJSON_ERROR_STRING("Miss exponent in number.");
+
+        case kParseErrorTermination:                    return RAPIDJSON_ERROR_STRING("Terminate parsing due to Handler error.");
+        case kParseErrorUnspecificSyntaxError:          return RAPIDJSON_ERROR_STRING("Unspecific syntax error.");
+
+        default:                                        return RAPIDJSON_ERROR_STRING("Unknown error.");
+    }
+}
+
+//! Maps error code of validation into error message.
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \param validateErrorCode Error code obtained from validator.
+    \return the error message.
+    \note User can make a copy of this function for localization.
+        Using switch-case is safer for future modification of error codes.
+*/
+inline const RAPIDJSON_ERROR_CHARTYPE* GetValidateError_En(ValidateErrorCode validateErrorCode) {
+    switch (validateErrorCode) {
+        case kValidateErrors:                           return RAPIDJSON_ERROR_STRING("One or more validation errors have occurred");
+        case kValidateErrorNone:                        return RAPIDJSON_ERROR_STRING("No error.");
+
+        case kValidateErrorMultipleOf:                  return RAPIDJSON_ERROR_STRING("Number '%actual' is not a multiple of the 'multipleOf' value '%expected'.");
+        case kValidateErrorMaximum:                     return RAPIDJSON_ERROR_STRING("Number '%actual' is greater than the 'maximum' value '%expected'.");
+        case kValidateErrorExclusiveMaximum:            return RAPIDJSON_ERROR_STRING("Number '%actual' is greater than or equal to the 'exclusiveMaximum' value '%expected'.");
+        case kValidateErrorMinimum:                     return RAPIDJSON_ERROR_STRING("Number '%actual' is less than the 'minimum' value '%expected'.");
+        case kValidateErrorExclusiveMinimum:            return RAPIDJSON_ERROR_STRING("Number '%actual' is less than or equal to the 'exclusiveMinimum' value '%expected'.");
+
+        case kValidateErrorMaxLength:                   return RAPIDJSON_ERROR_STRING("String '%actual' is longer than the 'maxLength' value '%expected'.");
+        case kValidateErrorMinLength:                   return RAPIDJSON_ERROR_STRING("String '%actual' is shorter than the 'minLength' value '%expected'.");
+        case kValidateErrorPattern:                     return RAPIDJSON_ERROR_STRING("String '%actual' does not match the 'pattern' regular expression.");
+
+        case kValidateErrorMaxItems:                    return RAPIDJSON_ERROR_STRING("Array of length '%actual' is longer than the 'maxItems' value '%expected'.");
+        case kValidateErrorMinItems:                    return RAPIDJSON_ERROR_STRING("Array of length '%actual' is shorter than the 'minItems' value '%expected'.");
+        case kValidateErrorUniqueItems:                 return RAPIDJSON_ERROR_STRING("Array has duplicate items at indices '%duplicates' but 'uniqueItems' is true.");
+        case kValidateErrorAdditionalItems:             return RAPIDJSON_ERROR_STRING("Array has an additional item at index '%disallowed' that is not allowed by the schema.");
+
+        case kValidateErrorMaxProperties:               return RAPIDJSON_ERROR_STRING("Object has '%actual' members which is more than 'maxProperties' value '%expected'.");
+        case kValidateErrorMinProperties:               return RAPIDJSON_ERROR_STRING("Object has '%actual' members which is less than 'minProperties' value '%expected'.");
+        case kValidateErrorRequired:                    return RAPIDJSON_ERROR_STRING("Object is missing the following members required by the schema: '%missing'.");
+        case kValidateErrorAdditionalProperties:        return RAPIDJSON_ERROR_STRING("Object has an additional member '%disallowed' that is not allowed by the schema.");
+        case kValidateErrorPatternProperties:           return RAPIDJSON_ERROR_STRING("Object has 'patternProperties' that are not allowed by the schema.");
+        case kValidateErrorDependencies:                return RAPIDJSON_ERROR_STRING("Object has missing property or schema dependencies, refer to following errors.");
+
+        case kValidateErrorEnum:                        return RAPIDJSON_ERROR_STRING("Property has a value that is not one of its allowed enumerated values.");
+        case kValidateErrorType:                        return RAPIDJSON_ERROR_STRING("Property has a type '%actual' that is not in the following list: '%expected'.");
+
+        case kValidateErrorOneOf:                       return RAPIDJSON_ERROR_STRING("Property did not match any of the sub-schemas specified by 'oneOf', refer to following errors.");
+        case kValidateErrorOneOfMatch:                  return RAPIDJSON_ERROR_STRING("Property matched more than one of the sub-schemas specified by 'oneOf', indices '%matches'.");
+        case kValidateErrorAllOf:                       return RAPIDJSON_ERROR_STRING("Property did not match all of the sub-schemas specified by 'allOf', refer to following errors.");
+        case kValidateErrorAnyOf:                       return RAPIDJSON_ERROR_STRING("Property did not match any of the sub-schemas specified by 'anyOf', refer to following errors.");
+        case kValidateErrorNot:                         return RAPIDJSON_ERROR_STRING("Property matched the sub-schema specified by 'not'.");
+
+        case kValidateErrorReadOnly:                    return RAPIDJSON_ERROR_STRING("Property is read-only but has been provided when validation is for writing.");
+        case kValidateErrorWriteOnly:                   return RAPIDJSON_ERROR_STRING("Property is write-only but has been provided when validation is for reading.");
+
+        default:                                        return RAPIDJSON_ERROR_STRING("Unknown error.");
+    }
+}
+
+//! Maps error code of schema document compilation into error message.
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \param schemaErrorCode Error code obtained from compiling the schema document.
+    \return the error message.
+    \note User can make a copy of this function for localization.
+        Using switch-case is safer for future modification of error codes.
+*/
+  inline const RAPIDJSON_ERROR_CHARTYPE* GetSchemaError_En(SchemaErrorCode schemaErrorCode) {
+      switch (schemaErrorCode) {
+          case kSchemaErrorNone:                        return RAPIDJSON_ERROR_STRING("No error.");
+
+          case kSchemaErrorStartUnknown:                return RAPIDJSON_ERROR_STRING("Pointer '%value' to start of schema does not resolve to a location in the document.");
+          case kSchemaErrorRefPlainName:                return RAPIDJSON_ERROR_STRING("$ref fragment '%value' must be a JSON pointer.");
+          case kSchemaErrorRefInvalid:                  return RAPIDJSON_ERROR_STRING("$ref must not be an empty string.");
+          case kSchemaErrorRefPointerInvalid:           return RAPIDJSON_ERROR_STRING("$ref fragment '%value' is not a valid JSON pointer at offset '%offset'.");
+          case kSchemaErrorRefUnknown:                  return RAPIDJSON_ERROR_STRING("$ref '%value' does not resolve to a location in the target document.");
+          case kSchemaErrorRefCyclical:                 return RAPIDJSON_ERROR_STRING("$ref '%value' is cyclical.");
+          case kSchemaErrorRefNoRemoteProvider:         return RAPIDJSON_ERROR_STRING("$ref is remote but there is no remote provider.");
+          case kSchemaErrorRefNoRemoteSchema:           return RAPIDJSON_ERROR_STRING("$ref '%value' is remote but the remote provider did not return a schema.");
+          case kSchemaErrorRegexInvalid:                return RAPIDJSON_ERROR_STRING("Invalid regular expression '%value' in 'pattern' or 'patternProperties'.");
+          case kSchemaErrorSpecUnknown:                 return RAPIDJSON_ERROR_STRING("JSON schema draft or OpenAPI version is not recognized.");
+          case kSchemaErrorSpecUnsupported:             return RAPIDJSON_ERROR_STRING("JSON schema draft or OpenAPI version is not supported.");
+          case kSchemaErrorSpecIllegal:                 return RAPIDJSON_ERROR_STRING("Both JSON schema draft and OpenAPI version found in document.");
+          case kSchemaErrorReadOnlyAndWriteOnly:        return RAPIDJSON_ERROR_STRING("Property must not be both 'readOnly' and 'writeOnly'.");
+
+          default:                                      return RAPIDJSON_ERROR_STRING("Unknown error.");
+    }
+  }
+
+//! Maps error code of pointer parse into error message.
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \param pointerParseErrorCode Error code obtained from pointer parse.
+    \return the error message.
+    \note User can make a copy of this function for localization.
+        Using switch-case is safer for future modification of error codes.
+*/
+inline const RAPIDJSON_ERROR_CHARTYPE* GetPointerParseError_En(PointerParseErrorCode pointerParseErrorCode) {
+    switch (pointerParseErrorCode) {
+        case kPointerParseErrorNone:                       return RAPIDJSON_ERROR_STRING("No error.");
+
+        case kPointerParseErrorTokenMustBeginWithSolidus:  return RAPIDJSON_ERROR_STRING("A token must begin with a '/'.");
+        case kPointerParseErrorInvalidEscape:              return RAPIDJSON_ERROR_STRING("Invalid escape.");
+        case kPointerParseErrorInvalidPercentEncoding:     return RAPIDJSON_ERROR_STRING("Invalid percent encoding in URI fragment.");
+        case kPointerParseErrorCharacterMustPercentEncode: return RAPIDJSON_ERROR_STRING("A character must be percent encoded in a URI fragment.");
+
+        default:                                           return RAPIDJSON_ERROR_STRING("Unknown error.");
+    }
+}
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ERROR_EN_H_
diff --git a/include/rapidjson/error/error.h b/include/rapidjson/error/error.h
new file mode 100644
index 0000000000..cae345db36
--- /dev/null
+++ b/include/rapidjson/error/error.h
@@ -0,0 +1,285 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ERROR_ERROR_H_
+#define RAPIDJSON_ERROR_ERROR_H_
+
+#include "../rapidjson.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+/*! \file error.h */
+
+/*! \defgroup RAPIDJSON_ERRORS RapidJSON error handling */
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ERROR_CHARTYPE
+
+//! Character type of error messages.
+/*! \ingroup RAPIDJSON_ERRORS
+    The default character type is \c char.
+    On Windows, user can define this macro as \c TCHAR for supporting both
+    unicode/non-unicode settings.
+*/
+#ifndef RAPIDJSON_ERROR_CHARTYPE
+#define RAPIDJSON_ERROR_CHARTYPE char
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ERROR_STRING
+
+//! Macro for converting string literal to \ref RAPIDJSON_ERROR_CHARTYPE[].
+/*! \ingroup RAPIDJSON_ERRORS
+    By default this conversion macro does nothing.
+    On Windows, user can define this macro as \c _T(x) for supporting both
+    unicode/non-unicode settings.
+*/
+#ifndef RAPIDJSON_ERROR_STRING
+#define RAPIDJSON_ERROR_STRING(x) x
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// ParseErrorCode
+
+//! Error code of parsing.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericReader::Parse, GenericReader::GetParseErrorCode
+*/
+enum ParseErrorCode {
+    kParseErrorNone = 0,                        //!< No error.
+
+    kParseErrorDocumentEmpty,                   //!< The document is empty.
+    kParseErrorDocumentRootNotSingular,         //!< The document root must not follow by other values.
+
+    kParseErrorValueInvalid,                    //!< Invalid value.
+
+    kParseErrorObjectMissName,                  //!< Missing a name for object member.
+    kParseErrorObjectMissColon,                 //!< Missing a colon after a name of object member.
+    kParseErrorObjectMissCommaOrCurlyBracket,   //!< Missing a comma or '}' after an object member.
+
+    kParseErrorArrayMissCommaOrSquareBracket,   //!< Missing a comma or ']' after an array element.
+
+    kParseErrorStringUnicodeEscapeInvalidHex,   //!< Incorrect hex digit after \\u escape in string.
+    kParseErrorStringUnicodeSurrogateInvalid,   //!< The surrogate pair in string is invalid.
+    kParseErrorStringEscapeInvalid,             //!< Invalid escape character in string.
+    kParseErrorStringMissQuotationMark,         //!< Missing a closing quotation mark in string.
+    kParseErrorStringInvalidEncoding,           //!< Invalid encoding in string.
+
+    kParseErrorNumberTooBig,                    //!< Number too big to be stored in double.
+    kParseErrorNumberMissFraction,              //!< Miss fraction part in number.
+    kParseErrorNumberMissExponent,              //!< Miss exponent in number.
+
+    kParseErrorTermination,                     //!< Parsing was terminated.
+    kParseErrorUnspecificSyntaxError            //!< Unspecific syntax error.
+};
+
+//! Result of parsing (wraps ParseErrorCode)
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \code
+        Document doc;
+        ParseResult ok = doc.Parse("[42]");
+        if (!ok) {
+            fprintf(stderr, "JSON parse error: %s (%u)",
+                    GetParseError_En(ok.Code()), ok.Offset());
+            exit(EXIT_FAILURE);
+        }
+    \endcode
+    \see GenericReader::Parse, GenericDocument::Parse
+*/
+struct ParseResult {
+    //!! Unspecified boolean type
+    typedef bool (ParseResult::*BooleanType)() const;
+public:
+    //! Default constructor, no error.
+    ParseResult() : code_(kParseErrorNone), offset_(0) {}
+    //! Constructor to set an error.
+    ParseResult(ParseErrorCode code, size_t offset) : code_(code), offset_(offset) {}
+
+    //! Get the error code.
+    ParseErrorCode Code() const { return code_; }
+    //! Get the error offset, if \ref IsError(), 0 otherwise.
+    size_t Offset() const { return offset_; }
+
+    //! Explicit conversion to \c bool, returns \c true, iff !\ref IsError().
+    operator BooleanType() const { return !IsError() ? &ParseResult::IsError : NULL; }
+    //! Whether the result is an error.
+    bool IsError() const { return code_ != kParseErrorNone; }
+
+    bool operator==(const ParseResult& that) const { return code_ == that.code_; }
+    bool operator==(ParseErrorCode code) const { return code_ == code; }
+    friend bool operator==(ParseErrorCode code, const ParseResult & err) { return code == err.code_; }
+
+    bool operator!=(const ParseResult& that) const { return !(*this == that); }
+    bool operator!=(ParseErrorCode code) const { return !(*this == code); }
+    friend bool operator!=(ParseErrorCode code, const ParseResult & err) { return err != code; }
+
+    //! Reset error code.
+    void Clear() { Set(kParseErrorNone); }
+    //! Update error code and offset.
+    void Set(ParseErrorCode code, size_t offset = 0) { code_ = code; offset_ = offset; }
+
+private:
+    ParseErrorCode code_;
+    size_t offset_;
+};
+
+//! Function pointer type of GetParseError().
+/*! \ingroup RAPIDJSON_ERRORS
+
+    This is the prototype for \c GetParseError_X(), where \c X is a locale.
+    User can dynamically change locale in runtime, e.g.:
+\code
+    GetParseErrorFunc GetParseError = GetParseError_En; // or whatever
+    const RAPIDJSON_ERROR_CHARTYPE* s = GetParseError(document.GetParseErrorCode());
+\endcode
+*/
+typedef const RAPIDJSON_ERROR_CHARTYPE* (*GetParseErrorFunc)(ParseErrorCode);
+
+///////////////////////////////////////////////////////////////////////////////
+// ValidateErrorCode
+
+//! Error codes when validating.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericSchemaValidator
+*/
+enum ValidateErrorCode {
+    kValidateErrors    = -1,                   //!< Top level error code when kValidateContinueOnErrorsFlag set.
+    kValidateErrorNone = 0,                    //!< No error.
+
+    kValidateErrorMultipleOf,                  //!< Number is not a multiple of the 'multipleOf' value.
+    kValidateErrorMaximum,                     //!< Number is greater than the 'maximum' value.
+    kValidateErrorExclusiveMaximum,            //!< Number is greater than or equal to the 'maximum' value.
+    kValidateErrorMinimum,                     //!< Number is less than the 'minimum' value.
+    kValidateErrorExclusiveMinimum,            //!< Number is less than or equal to the 'minimum' value.
+
+    kValidateErrorMaxLength,                   //!< String is longer than the 'maxLength' value.
+    kValidateErrorMinLength,                   //!< String is longer than the 'maxLength' value.
+    kValidateErrorPattern,                     //!< String does not match the 'pattern' regular expression.
+
+    kValidateErrorMaxItems,                    //!< Array is longer than the 'maxItems' value.
+    kValidateErrorMinItems,                    //!< Array is shorter than the 'minItems' value.
+    kValidateErrorUniqueItems,                 //!< Array has duplicate items but 'uniqueItems' is true.
+    kValidateErrorAdditionalItems,             //!< Array has additional items that are not allowed by the schema.
+
+    kValidateErrorMaxProperties,               //!< Object has more members than 'maxProperties' value.
+    kValidateErrorMinProperties,               //!< Object has less members than 'minProperties' value.
+    kValidateErrorRequired,                    //!< Object is missing one or more members required by the schema.
+    kValidateErrorAdditionalProperties,        //!< Object has additional members that are not allowed by the schema.
+    kValidateErrorPatternProperties,           //!< See other errors.
+    kValidateErrorDependencies,                //!< Object has missing property or schema dependencies.
+
+    kValidateErrorEnum,                        //!< Property has a value that is not one of its allowed enumerated values.
+    kValidateErrorType,                        //!< Property has a type that is not allowed by the schema.
+
+    kValidateErrorOneOf,                       //!< Property did not match any of the sub-schemas specified by 'oneOf'.
+    kValidateErrorOneOfMatch,                  //!< Property matched more than one of the sub-schemas specified by 'oneOf'.
+    kValidateErrorAllOf,                       //!< Property did not match all of the sub-schemas specified by 'allOf'.
+    kValidateErrorAnyOf,                       //!< Property did not match any of the sub-schemas specified by 'anyOf'.
+    kValidateErrorNot,                         //!< Property matched the sub-schema specified by 'not'.
+
+    kValidateErrorReadOnly,                    //!< Property is read-only but has been provided when validation is for writing
+    kValidateErrorWriteOnly                    //!< Property is write-only but has been provided when validation is for reading
+};
+
+//! Function pointer type of GetValidateError().
+/*! \ingroup RAPIDJSON_ERRORS
+
+    This is the prototype for \c GetValidateError_X(), where \c X is a locale.
+    User can dynamically change locale in runtime, e.g.:
+\code
+    GetValidateErrorFunc GetValidateError = GetValidateError_En; // or whatever
+    const RAPIDJSON_ERROR_CHARTYPE* s = GetValidateError(validator.GetInvalidSchemaCode());
+\endcode
+*/
+typedef const RAPIDJSON_ERROR_CHARTYPE* (*GetValidateErrorFunc)(ValidateErrorCode);
+
+///////////////////////////////////////////////////////////////////////////////
+// SchemaErrorCode
+
+//! Error codes when validating.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericSchemaValidator
+*/
+enum SchemaErrorCode {
+    kSchemaErrorNone = 0,                      //!< No error.
+
+    kSchemaErrorStartUnknown,                  //!< Pointer to start of schema does not resolve to a location in the document
+    kSchemaErrorRefPlainName,                  //!< $ref fragment must be a JSON pointer
+    kSchemaErrorRefInvalid,                    //!< $ref must not be an empty string
+    kSchemaErrorRefPointerInvalid,             //!< $ref fragment is not a valid JSON pointer at offset
+    kSchemaErrorRefUnknown,                    //!< $ref does not resolve to a location in the target document
+    kSchemaErrorRefCyclical,                   //!< $ref is cyclical
+    kSchemaErrorRefNoRemoteProvider,           //!< $ref is remote but there is no remote provider
+    kSchemaErrorRefNoRemoteSchema,             //!< $ref is remote but the remote provider did not return a schema
+    kSchemaErrorRegexInvalid,                  //!< Invalid regular expression in 'pattern' or 'patternProperties'
+    kSchemaErrorSpecUnknown,                   //!< JSON schema draft or OpenAPI version is not recognized
+    kSchemaErrorSpecUnsupported,               //!< JSON schema draft or OpenAPI version is not supported
+    kSchemaErrorSpecIllegal,                   //!< Both JSON schema draft and OpenAPI version found in document
+    kSchemaErrorReadOnlyAndWriteOnly           //!< Property must not be both 'readOnly' and 'writeOnly'
+};
+
+//! Function pointer type of GetSchemaError().
+/*! \ingroup RAPIDJSON_ERRORS
+
+    This is the prototype for \c GetSchemaError_X(), where \c X is a locale.
+    User can dynamically change locale in runtime, e.g.:
+\code
+    GetSchemaErrorFunc GetSchemaError = GetSchemaError_En; // or whatever
+    const RAPIDJSON_ERROR_CHARTYPE* s = GetSchemaError(validator.GetInvalidSchemaCode());
+\endcode
+*/
+typedef const RAPIDJSON_ERROR_CHARTYPE* (*GetSchemaErrorFunc)(SchemaErrorCode);
+
+///////////////////////////////////////////////////////////////////////////////
+// PointerParseErrorCode
+
+//! Error code of JSON pointer parsing.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericPointer::GenericPointer, GenericPointer::GetParseErrorCode
+*/
+enum PointerParseErrorCode {
+    kPointerParseErrorNone = 0,                     //!< The parse is successful
+
+    kPointerParseErrorTokenMustBeginWithSolidus,    //!< A token must begin with a '/'
+    kPointerParseErrorInvalidEscape,                //!< Invalid escape
+    kPointerParseErrorInvalidPercentEncoding,       //!< Invalid percent encoding in URI fragment
+    kPointerParseErrorCharacterMustPercentEncode    //!< A character must percent encoded in URI fragment
+};
+
+//! Function pointer type of GetPointerParseError().
+/*! \ingroup RAPIDJSON_ERRORS
+
+    This is the prototype for \c GetPointerParseError_X(), where \c X is a locale.
+    User can dynamically change locale in runtime, e.g.:
+\code
+    GetPointerParseErrorFunc GetPointerParseError = GetPointerParseError_En; // or whatever
+    const RAPIDJSON_ERROR_CHARTYPE* s = GetPointerParseError(pointer.GetParseErrorCode());
+\endcode
+*/
+typedef const RAPIDJSON_ERROR_CHARTYPE* (*GetPointerParseErrorFunc)(PointerParseErrorCode);
+
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ERROR_ERROR_H_
diff --git a/include/rapidjson/filereadstream.h b/include/rapidjson/filereadstream.h
new file mode 100644
index 0000000000..f8bb43cb0c
--- /dev/null
+++ b/include/rapidjson/filereadstream.h
@@ -0,0 +1,99 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FILEREADSTREAM_H_
+#define RAPIDJSON_FILEREADSTREAM_H_
+
+#include "stream.h"
+#include <cstdio>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(unreachable-code)
+RAPIDJSON_DIAG_OFF(missing-noreturn)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! File byte stream for input using fread().
+/*!
+    \note implements Stream concept
+*/
+class FileReadStream {
+public:
+    typedef char Ch;    //!< Character type (byte).
+
+    //! Constructor.
+    /*!
+        \param fp File pointer opened for read.
+        \param buffer user-supplied buffer.
+        \param bufferSize size of buffer in bytes. Must >=4 bytes.
+    */
+    FileReadStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { 
+        RAPIDJSON_ASSERT(fp_ != 0);
+        RAPIDJSON_ASSERT(bufferSize >= 4);
+        Read();
+    }
+
+    Ch Peek() const { return *current_; }
+    Ch Take() { Ch c = *current_; Read(); return c; }
+    size_t Tell() const { return count_ + static_cast<size_t>(current_ - buffer_); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); } 
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    // For encoding detection only.
+    const Ch* Peek4() const {
+        return (current_ + 4 - !eof_ <= bufferLast_) ? current_ : 0;
+    }
+
+private:
+    void Read() {
+        if (current_ < bufferLast_)
+            ++current_;
+        else if (!eof_) {
+            count_ += readCount_;
+            readCount_ = std::fread(buffer_, 1, bufferSize_, fp_);
+            bufferLast_ = buffer_ + readCount_ - 1;
+            current_ = buffer_;
+
+            if (readCount_ < bufferSize_) {
+                buffer_[readCount_] = '\0';
+                ++bufferLast_;
+                eof_ = true;
+            }
+        }
+    }
+
+    std::FILE* fp_;
+    Ch *buffer_;
+    size_t bufferSize_;
+    Ch *bufferLast_;
+    Ch *current_;
+    size_t readCount_;
+    size_t count_;  //!< Number of characters read
+    bool eof_;
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_FILESTREAM_H_
diff --git a/include/rapidjson/filewritestream.h b/include/rapidjson/filewritestream.h
new file mode 100644
index 0000000000..5d89588c21
--- /dev/null
+++ b/include/rapidjson/filewritestream.h
@@ -0,0 +1,104 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FILEWRITESTREAM_H_
+#define RAPIDJSON_FILEWRITESTREAM_H_
+
+#include "stream.h"
+#include <cstdio>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(unreachable-code)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Wrapper of C file stream for output using fwrite().
+/*!
+    \note implements Stream concept
+*/
+class FileWriteStream {
+public:
+    typedef char Ch;    //!< Character type. Only support char.
+
+    FileWriteStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferEnd_(buffer + bufferSize), current_(buffer_) { 
+        RAPIDJSON_ASSERT(fp_ != 0);
+    }
+
+    void Put(char c) { 
+        if (current_ >= bufferEnd_)
+            Flush();
+
+        *current_++ = c;
+    }
+
+    void PutN(char c, size_t n) {
+        size_t avail = static_cast<size_t>(bufferEnd_ - current_);
+        while (n > avail) {
+            std::memset(current_, c, avail);
+            current_ += avail;
+            Flush();
+            n -= avail;
+            avail = static_cast<size_t>(bufferEnd_ - current_);
+        }
+
+        if (n > 0) {
+            std::memset(current_, c, n);
+            current_ += n;
+        }
+    }
+
+    void Flush() {
+        if (current_ != buffer_) {
+            size_t result = std::fwrite(buffer_, 1, static_cast<size_t>(current_ - buffer_), fp_);
+            if (result < static_cast<size_t>(current_ - buffer_)) {
+                // failure deliberately ignored at this time
+                // added to avoid warn_unused_result build errors
+            }
+            current_ = buffer_;
+        }
+    }
+
+    // Not implemented
+    char Peek() const { RAPIDJSON_ASSERT(false); return 0; }
+    char Take() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    FileWriteStream(const FileWriteStream&);
+    FileWriteStream& operator=(const FileWriteStream&);
+
+    std::FILE* fp_;
+    char *buffer_;
+    char *bufferEnd_;
+    char *current_;
+};
+
+//! Implement specialized version of PutN() with memset() for better performance.
+template<>
+inline void PutN(FileWriteStream& stream, char c, size_t n) {
+    stream.PutN(c, n);
+}
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_FILESTREAM_H_
diff --git a/include/rapidjson/fwd.h b/include/rapidjson/fwd.h
new file mode 100644
index 0000000000..d62f77f0ec
--- /dev/null
+++ b/include/rapidjson/fwd.h
@@ -0,0 +1,151 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FWD_H_
+#define RAPIDJSON_FWD_H_
+
+#include "rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+// encodings.h
+
+template<typename CharType> struct UTF8;
+template<typename CharType> struct UTF16;
+template<typename CharType> struct UTF16BE;
+template<typename CharType> struct UTF16LE;
+template<typename CharType> struct UTF32;
+template<typename CharType> struct UTF32BE;
+template<typename CharType> struct UTF32LE;
+template<typename CharType> struct ASCII;
+template<typename CharType> struct AutoUTF;
+
+template<typename SourceEncoding, typename TargetEncoding>
+struct Transcoder;
+
+// allocators.h
+
+class CrtAllocator;
+
+template <typename BaseAllocator>
+class MemoryPoolAllocator;
+
+// stream.h
+
+template <typename Encoding>
+struct GenericStringStream;
+
+typedef GenericStringStream<UTF8<char> > StringStream;
+
+template <typename Encoding>
+struct GenericInsituStringStream;
+
+typedef GenericInsituStringStream<UTF8<char> > InsituStringStream;
+
+// stringbuffer.h
+
+template <typename Encoding, typename Allocator>
+class GenericStringBuffer;
+
+typedef GenericStringBuffer<UTF8<char>, CrtAllocator> StringBuffer;
+
+// filereadstream.h
+
+class FileReadStream;
+
+// filewritestream.h
+
+class FileWriteStream;
+
+// memorybuffer.h
+
+template <typename Allocator>
+struct GenericMemoryBuffer;
+
+typedef GenericMemoryBuffer<CrtAllocator> MemoryBuffer;
+
+// memorystream.h
+
+struct MemoryStream;
+
+// reader.h
+
+template<typename Encoding, typename Derived>
+struct BaseReaderHandler;
+
+template <typename SourceEncoding, typename TargetEncoding, typename StackAllocator>
+class GenericReader;
+
+typedef GenericReader<UTF8<char>, UTF8<char>, CrtAllocator> Reader;
+
+// writer.h
+
+template<typename OutputStream, typename SourceEncoding, typename TargetEncoding, typename StackAllocator, unsigned writeFlags>
+class Writer;
+
+// prettywriter.h
+
+template<typename OutputStream, typename SourceEncoding, typename TargetEncoding, typename StackAllocator, unsigned writeFlags>
+class PrettyWriter;
+
+// document.h
+
+template <typename Encoding, typename Allocator> 
+class GenericMember;
+
+template <bool Const, typename Encoding, typename Allocator>
+class GenericMemberIterator;
+
+template<typename CharType>
+struct GenericStringRef;
+
+template <typename Encoding, typename Allocator> 
+class GenericValue;
+
+typedef GenericValue<UTF8<char>, MemoryPoolAllocator<CrtAllocator> > Value;
+
+template <typename Encoding, typename Allocator, typename StackAllocator>
+class GenericDocument;
+
+typedef GenericDocument<UTF8<char>, MemoryPoolAllocator<CrtAllocator>, CrtAllocator> Document;
+
+// pointer.h
+
+template <typename ValueType, typename Allocator>
+class GenericPointer;
+
+typedef GenericPointer<Value, CrtAllocator> Pointer;
+
+// schema.h
+
+template <typename SchemaDocumentType>
+class IGenericRemoteSchemaDocumentProvider;
+
+template <typename ValueT, typename Allocator>
+class GenericSchemaDocument;
+
+typedef GenericSchemaDocument<Value, CrtAllocator> SchemaDocument;
+typedef IGenericRemoteSchemaDocumentProvider<SchemaDocument> IRemoteSchemaDocumentProvider;
+
+template <
+    typename SchemaDocumentType,
+    typename OutputHandler,
+    typename StateAllocator>
+class GenericSchemaValidator;
+
+typedef GenericSchemaValidator<SchemaDocument, BaseReaderHandler<UTF8<char>, void>, CrtAllocator> SchemaValidator;
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_RAPIDJSONFWD_H_
diff --git a/include/rapidjson/internal/biginteger.h b/include/rapidjson/internal/biginteger.h
new file mode 100644
index 0000000000..4930043dc7
--- /dev/null
+++ b/include/rapidjson/internal/biginteger.h
@@ -0,0 +1,297 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_BIGINTEGER_H_
+#define RAPIDJSON_BIGINTEGER_H_
+
+#include "../rapidjson.h"
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && defined(_M_AMD64)
+#include <intrin.h> // for _umul128
+#if !defined(_ARM64EC_)
+#pragma intrinsic(_umul128)
+#else
+#pragma comment(lib,"softintrin")
+#endif
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+class BigInteger {
+public:
+    typedef uint64_t Type;
+
+    BigInteger(const BigInteger& rhs) : count_(rhs.count_) {
+        std::memcpy(digits_, rhs.digits_, count_ * sizeof(Type));
+    }
+
+    explicit BigInteger(uint64_t u) : count_(1) {
+        digits_[0] = u;
+    }
+
+    template<typename Ch>
+    BigInteger(const Ch* decimals, size_t length) : count_(1) {
+        RAPIDJSON_ASSERT(length > 0);
+        digits_[0] = 0;
+        size_t i = 0;
+        const size_t kMaxDigitPerIteration = 19;  // 2^64 = 18446744073709551616 > 10^19
+        while (length >= kMaxDigitPerIteration) {
+            AppendDecimal64(decimals + i, decimals + i + kMaxDigitPerIteration);
+            length -= kMaxDigitPerIteration;
+            i += kMaxDigitPerIteration;
+        }
+
+        if (length > 0)
+            AppendDecimal64(decimals + i, decimals + i + length);
+    }
+    
+    BigInteger& operator=(const BigInteger &rhs)
+    {
+        if (this != &rhs) {
+            count_ = rhs.count_;
+            std::memcpy(digits_, rhs.digits_, count_ * sizeof(Type));
+        }
+        return *this;
+    }
+    
+    BigInteger& operator=(uint64_t u) {
+        digits_[0] = u;            
+        count_ = 1;
+        return *this;
+    }
+
+    BigInteger& operator+=(uint64_t u) {
+        Type backup = digits_[0];
+        digits_[0] += u;
+        for (size_t i = 0; i < count_ - 1; i++) {
+            if (digits_[i] >= backup)
+                return *this; // no carry
+            backup = digits_[i + 1];
+            digits_[i + 1] += 1;
+        }
+
+        // Last carry
+        if (digits_[count_ - 1] < backup)
+            PushBack(1);
+
+        return *this;
+    }
+
+    BigInteger& operator*=(uint64_t u) {
+        if (u == 0) return *this = 0;
+        if (u == 1) return *this;
+        if (*this == 1) return *this = u;
+
+        uint64_t k = 0;
+        for (size_t i = 0; i < count_; i++) {
+            uint64_t hi;
+            digits_[i] = MulAdd64(digits_[i], u, k, &hi);
+            k = hi;
+        }
+        
+        if (k > 0)
+            PushBack(k);
+
+        return *this;
+    }
+
+    BigInteger& operator*=(uint32_t u) {
+        if (u == 0) return *this = 0;
+        if (u == 1) return *this;
+        if (*this == 1) return *this = u;
+
+        uint64_t k = 0;
+        for (size_t i = 0; i < count_; i++) {
+            const uint64_t c = digits_[i] >> 32;
+            const uint64_t d = digits_[i] & 0xFFFFFFFF;
+            const uint64_t uc = u * c;
+            const uint64_t ud = u * d;
+            const uint64_t p0 = ud + k;
+            const uint64_t p1 = uc + (p0 >> 32);
+            digits_[i] = (p0 & 0xFFFFFFFF) | (p1 << 32);
+            k = p1 >> 32;
+        }
+        
+        if (k > 0)
+            PushBack(k);
+
+        return *this;
+    }
+
+    BigInteger& operator<<=(size_t shift) {
+        if (IsZero() || shift == 0) return *this;
+
+        size_t offset = shift / kTypeBit;
+        size_t interShift = shift % kTypeBit;
+        RAPIDJSON_ASSERT(count_ + offset <= kCapacity);
+
+        if (interShift == 0) {
+            std::memmove(digits_ + offset, digits_, count_ * sizeof(Type));
+            count_ += offset;
+        }
+        else {
+            digits_[count_] = 0;
+            for (size_t i = count_; i > 0; i--)
+                digits_[i + offset] = (digits_[i] << interShift) | (digits_[i - 1] >> (kTypeBit - interShift));
+            digits_[offset] = digits_[0] << interShift;
+            count_ += offset;
+            if (digits_[count_])
+                count_++;
+        }
+
+        std::memset(digits_, 0, offset * sizeof(Type));
+
+        return *this;
+    }
+
+    bool operator==(const BigInteger& rhs) const {
+        return count_ == rhs.count_ && std::memcmp(digits_, rhs.digits_, count_ * sizeof(Type)) == 0;
+    }
+
+    bool operator==(const Type rhs) const {
+        return count_ == 1 && digits_[0] == rhs;
+    }
+
+    BigInteger& MultiplyPow5(unsigned exp) {
+        static const uint32_t kPow5[12] = {
+            5,
+            5 * 5,
+            5 * 5 * 5,
+            5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5
+        };
+        if (exp == 0) return *this;
+        for (; exp >= 27; exp -= 27) *this *= RAPIDJSON_UINT64_C2(0X6765C793, 0XFA10079D); // 5^27
+        for (; exp >= 13; exp -= 13) *this *= static_cast<uint32_t>(1220703125u); // 5^13
+        if (exp > 0)                 *this *= kPow5[exp - 1];
+        return *this;
+    }
+
+    // Compute absolute difference of this and rhs.
+    // Assume this != rhs
+    bool Difference(const BigInteger& rhs, BigInteger* out) const {
+        int cmp = Compare(rhs);
+        RAPIDJSON_ASSERT(cmp != 0);
+        const BigInteger *a, *b;  // Makes a > b
+        bool ret;
+        if (cmp < 0) { a = &rhs; b = this; ret = true; }
+        else         { a = this; b = &rhs; ret = false; }
+
+        Type borrow = 0;
+        for (size_t i = 0; i < a->count_; i++) {
+            Type d = a->digits_[i] - borrow;
+            if (i < b->count_)
+                d -= b->digits_[i];
+            borrow = (d > a->digits_[i]) ? 1 : 0;
+            out->digits_[i] = d;
+            if (d != 0)
+                out->count_ = i + 1;
+        }
+
+        return ret;
+    }
+
+    int Compare(const BigInteger& rhs) const {
+        if (count_ != rhs.count_)
+            return count_ < rhs.count_ ? -1 : 1;
+
+        for (size_t i = count_; i-- > 0;)
+            if (digits_[i] != rhs.digits_[i])
+                return digits_[i] < rhs.digits_[i] ? -1 : 1;
+
+        return 0;
+    }
+
+    size_t GetCount() const { return count_; }
+    Type GetDigit(size_t index) const { RAPIDJSON_ASSERT(index < count_); return digits_[index]; }
+    bool IsZero() const { return count_ == 1 && digits_[0] == 0; }
+
+private:
+    template<typename Ch>
+    void AppendDecimal64(const Ch* begin, const Ch* end) {
+        uint64_t u = ParseUint64(begin, end);
+        if (IsZero())
+            *this = u;
+        else {
+            unsigned exp = static_cast<unsigned>(end - begin);
+            (MultiplyPow5(exp) <<= exp) += u;   // *this = *this * 10^exp + u
+        }
+    }
+
+    void PushBack(Type digit) {
+        RAPIDJSON_ASSERT(count_ < kCapacity);
+        digits_[count_++] = digit;
+    }
+
+    template<typename Ch>
+    static uint64_t ParseUint64(const Ch* begin, const Ch* end) {
+        uint64_t r = 0;
+        for (const Ch* p = begin; p != end; ++p) {
+            RAPIDJSON_ASSERT(*p >= Ch('0') && *p <= Ch('9'));
+            r = r * 10u + static_cast<unsigned>(*p - Ch('0'));
+        }
+        return r;
+    }
+
+    // Assume a * b + k < 2^128
+    static uint64_t MulAdd64(uint64_t a, uint64_t b, uint64_t k, uint64_t* outHigh) {
+#if defined(_MSC_VER) && defined(_M_AMD64)
+        uint64_t low = _umul128(a, b, outHigh) + k;
+        if (low < k)
+            (*outHigh)++;
+        return low;
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__x86_64__)
+        __extension__ typedef unsigned __int128 uint128;
+        uint128 p = static_cast<uint128>(a) * static_cast<uint128>(b);
+        p += k;
+        *outHigh = static_cast<uint64_t>(p >> 64);
+        return static_cast<uint64_t>(p);
+#else
+        const uint64_t a0 = a & 0xFFFFFFFF, a1 = a >> 32, b0 = b & 0xFFFFFFFF, b1 = b >> 32;
+        uint64_t x0 = a0 * b0, x1 = a0 * b1, x2 = a1 * b0, x3 = a1 * b1;
+        x1 += (x0 >> 32); // can't give carry
+        x1 += x2;
+        if (x1 < x2)
+            x3 += (static_cast<uint64_t>(1) << 32);
+        uint64_t lo = (x1 << 32) + (x0 & 0xFFFFFFFF);
+        uint64_t hi = x3 + (x1 >> 32);
+
+        lo += k;
+        if (lo < k)
+            hi++;
+        *outHigh = hi;
+        return lo;
+#endif
+    }
+
+    static const size_t kBitCount = 3328;  // 64bit * 54 > 10^1000
+    static const size_t kCapacity = kBitCount / sizeof(Type);
+    static const size_t kTypeBit = sizeof(Type) * 8;
+
+    Type digits_[kCapacity];
+    size_t count_;
+};
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_BIGINTEGER_H_
diff --git a/include/rapidjson/internal/clzll.h b/include/rapidjson/internal/clzll.h
new file mode 100644
index 0000000000..8fc5118aa4
--- /dev/null
+++ b/include/rapidjson/internal/clzll.h
@@ -0,0 +1,71 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_CLZLL_H_
+#define RAPIDJSON_CLZLL_H_
+
+#include "../rapidjson.h"
+
+#if defined(_MSC_VER) && !defined(UNDER_CE)
+#include <intrin.h>
+#if defined(_WIN64)
+#pragma intrinsic(_BitScanReverse64)
+#else
+#pragma intrinsic(_BitScanReverse)
+#endif
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+inline uint32_t clzll(uint64_t x) {
+    // Passing 0 to __builtin_clzll is UB in GCC and results in an
+    // infinite loop in the software implementation.
+    RAPIDJSON_ASSERT(x != 0);
+
+#if defined(_MSC_VER) && !defined(UNDER_CE)
+    unsigned long r = 0;
+#if defined(_WIN64)
+    _BitScanReverse64(&r, x);
+#else
+    // Scan the high 32 bits.
+    if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+        return 63 - (r + 32);
+
+    // Scan the low 32 bits.
+    _BitScanReverse(&r, static_cast<uint32_t>(x & 0xFFFFFFFF));
+#endif // _WIN64
+
+    return 63 - r;
+#elif (defined(__GNUC__) && __GNUC__ >= 4) || RAPIDJSON_HAS_BUILTIN(__builtin_clzll)
+    // __builtin_clzll wrapper
+    return static_cast<uint32_t>(__builtin_clzll(x));
+#else
+    // naive version
+    uint32_t r = 0;
+    while (!(x & (static_cast<uint64_t>(1) << 63))) {
+        x <<= 1;
+        ++r;
+    }
+
+    return r;
+#endif // _MSC_VER
+}
+
+#define RAPIDJSON_CLZLL RAPIDJSON_NAMESPACE::internal::clzll
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_CLZLL_H_
diff --git a/include/rapidjson/internal/diyfp.h b/include/rapidjson/internal/diyfp.h
new file mode 100644
index 0000000000..1f60fb60ca
--- /dev/null
+++ b/include/rapidjson/internal/diyfp.h
@@ -0,0 +1,261 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// This is a C++ header-only implementation of Grisu2 algorithm from the publication:
+// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with
+// integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+
+#ifndef RAPIDJSON_DIYFP_H_
+#define RAPIDJSON_DIYFP_H_
+
+#include "../rapidjson.h"
+#include "clzll.h"
+#include <limits>
+
+#if defined(_MSC_VER) && defined(_M_AMD64) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+#if !defined(_ARM64EC_)
+#pragma intrinsic(_umul128)
+#else
+#pragma comment(lib,"softintrin")
+#endif
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+struct DiyFp {
+    DiyFp() : f(), e() {}
+
+    DiyFp(uint64_t fp, int exp) : f(fp), e(exp) {}
+
+    explicit DiyFp(double d) {
+        union {
+            double d;
+            uint64_t u64;
+        } u = { d };
+
+        int biased_e = static_cast<int>((u.u64 & kDpExponentMask) >> kDpSignificandSize);
+        uint64_t significand = (u.u64 & kDpSignificandMask);
+        if (biased_e != 0) {
+            f = significand + kDpHiddenBit;
+            e = biased_e - kDpExponentBias;
+        }
+        else {
+            f = significand;
+            e = kDpMinExponent + 1;
+        }
+    }
+
+    DiyFp operator-(const DiyFp& rhs) const {
+        return DiyFp(f - rhs.f, e);
+    }
+
+    DiyFp operator*(const DiyFp& rhs) const {
+#if defined(_MSC_VER) && defined(_M_AMD64)
+        uint64_t h;
+        uint64_t l = _umul128(f, rhs.f, &h);
+        if (l & (uint64_t(1) << 63)) // rounding
+            h++;
+        return DiyFp(h, e + rhs.e + 64);
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__x86_64__)
+        __extension__ typedef unsigned __int128 uint128;
+        uint128 p = static_cast<uint128>(f) * static_cast<uint128>(rhs.f);
+        uint64_t h = static_cast<uint64_t>(p >> 64);
+        uint64_t l = static_cast<uint64_t>(p);
+        if (l & (uint64_t(1) << 63)) // rounding
+            h++;
+        return DiyFp(h, e + rhs.e + 64);
+#else
+        const uint64_t M32 = 0xFFFFFFFF;
+        const uint64_t a = f >> 32;
+        const uint64_t b = f & M32;
+        const uint64_t c = rhs.f >> 32;
+        const uint64_t d = rhs.f & M32;
+        const uint64_t ac = a * c;
+        const uint64_t bc = b * c;
+        const uint64_t ad = a * d;
+        const uint64_t bd = b * d;
+        uint64_t tmp = (bd >> 32) + (ad & M32) + (bc & M32);
+        tmp += 1U << 31;  /// mult_round
+        return DiyFp(ac + (ad >> 32) + (bc >> 32) + (tmp >> 32), e + rhs.e + 64);
+#endif
+    }
+
+    DiyFp Normalize() const {
+        int s = static_cast<int>(clzll(f));
+        return DiyFp(f << s, e - s);
+    }
+
+    DiyFp NormalizeBoundary() const {
+        DiyFp res = *this;
+        while (!(res.f & (kDpHiddenBit << 1))) {
+            res.f <<= 1;
+            res.e--;
+        }
+        res.f <<= (kDiySignificandSize - kDpSignificandSize - 2);
+        res.e = res.e - (kDiySignificandSize - kDpSignificandSize - 2);
+        return res;
+    }
+
+    void NormalizedBoundaries(DiyFp* minus, DiyFp* plus) const {
+        DiyFp pl = DiyFp((f << 1) + 1, e - 1).NormalizeBoundary();
+        DiyFp mi = (f == kDpHiddenBit) ? DiyFp((f << 2) - 1, e - 2) : DiyFp((f << 1) - 1, e - 1);
+        mi.f <<= mi.e - pl.e;
+        mi.e = pl.e;
+        *plus = pl;
+        *minus = mi;
+    }
+
+    double ToDouble() const {
+        union {
+            double d;
+            uint64_t u64;
+        }u;
+        RAPIDJSON_ASSERT(f <= kDpHiddenBit + kDpSignificandMask);
+        if (e < kDpDenormalExponent) {
+            // Underflow.
+            return 0.0;
+        }
+        if (e >= kDpMaxExponent) {
+            // Overflow.
+            return std::numeric_limits<double>::infinity();
+        }
+        const uint64_t be = (e == kDpDenormalExponent && (f & kDpHiddenBit) == 0) ? 0 :
+            static_cast<uint64_t>(e + kDpExponentBias);
+        u.u64 = (f & kDpSignificandMask) | (be << kDpSignificandSize);
+        return u.d;
+    }
+
+    static const int kDiySignificandSize = 64;
+    static const int kDpSignificandSize = 52;
+    static const int kDpExponentBias = 0x3FF + kDpSignificandSize;
+    static const int kDpMaxExponent = 0x7FF - kDpExponentBias;
+    static const int kDpMinExponent = -kDpExponentBias;
+    static const int kDpDenormalExponent = -kDpExponentBias + 1;
+    static const uint64_t kDpExponentMask = RAPIDJSON_UINT64_C2(0x7FF00000, 0x00000000);
+    static const uint64_t kDpSignificandMask = RAPIDJSON_UINT64_C2(0x000FFFFF, 0xFFFFFFFF);
+    static const uint64_t kDpHiddenBit = RAPIDJSON_UINT64_C2(0x00100000, 0x00000000);
+
+    uint64_t f;
+    int e;
+};
+
+inline DiyFp GetCachedPowerByIndex(size_t index) {
+    // 10^-348, 10^-340, ..., 10^340
+    static const uint64_t kCachedPowers_F[] = {
+        RAPIDJSON_UINT64_C2(0xfa8fd5a0, 0x081c0288), RAPIDJSON_UINT64_C2(0xbaaee17f, 0xa23ebf76),
+        RAPIDJSON_UINT64_C2(0x8b16fb20, 0x3055ac76), RAPIDJSON_UINT64_C2(0xcf42894a, 0x5dce35ea),
+        RAPIDJSON_UINT64_C2(0x9a6bb0aa, 0x55653b2d), RAPIDJSON_UINT64_C2(0xe61acf03, 0x3d1a45df),
+        RAPIDJSON_UINT64_C2(0xab70fe17, 0xc79ac6ca), RAPIDJSON_UINT64_C2(0xff77b1fc, 0xbebcdc4f),
+        RAPIDJSON_UINT64_C2(0xbe5691ef, 0x416bd60c), RAPIDJSON_UINT64_C2(0x8dd01fad, 0x907ffc3c),
+        RAPIDJSON_UINT64_C2(0xd3515c28, 0x31559a83), RAPIDJSON_UINT64_C2(0x9d71ac8f, 0xada6c9b5),
+        RAPIDJSON_UINT64_C2(0xea9c2277, 0x23ee8bcb), RAPIDJSON_UINT64_C2(0xaecc4991, 0x4078536d),
+        RAPIDJSON_UINT64_C2(0x823c1279, 0x5db6ce57), RAPIDJSON_UINT64_C2(0xc2109436, 0x4dfb5637),
+        RAPIDJSON_UINT64_C2(0x9096ea6f, 0x3848984f), RAPIDJSON_UINT64_C2(0xd77485cb, 0x25823ac7),
+        RAPIDJSON_UINT64_C2(0xa086cfcd, 0x97bf97f4), RAPIDJSON_UINT64_C2(0xef340a98, 0x172aace5),
+        RAPIDJSON_UINT64_C2(0xb23867fb, 0x2a35b28e), RAPIDJSON_UINT64_C2(0x84c8d4df, 0xd2c63f3b),
+        RAPIDJSON_UINT64_C2(0xc5dd4427, 0x1ad3cdba), RAPIDJSON_UINT64_C2(0x936b9fce, 0xbb25c996),
+        RAPIDJSON_UINT64_C2(0xdbac6c24, 0x7d62a584), RAPIDJSON_UINT64_C2(0xa3ab6658, 0x0d5fdaf6),
+        RAPIDJSON_UINT64_C2(0xf3e2f893, 0xdec3f126), RAPIDJSON_UINT64_C2(0xb5b5ada8, 0xaaff80b8),
+        RAPIDJSON_UINT64_C2(0x87625f05, 0x6c7c4a8b), RAPIDJSON_UINT64_C2(0xc9bcff60, 0x34c13053),
+        RAPIDJSON_UINT64_C2(0x964e858c, 0x91ba2655), RAPIDJSON_UINT64_C2(0xdff97724, 0x70297ebd),
+        RAPIDJSON_UINT64_C2(0xa6dfbd9f, 0xb8e5b88f), RAPIDJSON_UINT64_C2(0xf8a95fcf, 0x88747d94),
+        RAPIDJSON_UINT64_C2(0xb9447093, 0x8fa89bcf), RAPIDJSON_UINT64_C2(0x8a08f0f8, 0xbf0f156b),
+        RAPIDJSON_UINT64_C2(0xcdb02555, 0x653131b6), RAPIDJSON_UINT64_C2(0x993fe2c6, 0xd07b7fac),
+        RAPIDJSON_UINT64_C2(0xe45c10c4, 0x2a2b3b06), RAPIDJSON_UINT64_C2(0xaa242499, 0x697392d3),
+        RAPIDJSON_UINT64_C2(0xfd87b5f2, 0x8300ca0e), RAPIDJSON_UINT64_C2(0xbce50864, 0x92111aeb),
+        RAPIDJSON_UINT64_C2(0x8cbccc09, 0x6f5088cc), RAPIDJSON_UINT64_C2(0xd1b71758, 0xe219652c),
+        RAPIDJSON_UINT64_C2(0x9c400000, 0x00000000), RAPIDJSON_UINT64_C2(0xe8d4a510, 0x00000000),
+        RAPIDJSON_UINT64_C2(0xad78ebc5, 0xac620000), RAPIDJSON_UINT64_C2(0x813f3978, 0xf8940984),
+        RAPIDJSON_UINT64_C2(0xc097ce7b, 0xc90715b3), RAPIDJSON_UINT64_C2(0x8f7e32ce, 0x7bea5c70),
+        RAPIDJSON_UINT64_C2(0xd5d238a4, 0xabe98068), RAPIDJSON_UINT64_C2(0x9f4f2726, 0x179a2245),
+        RAPIDJSON_UINT64_C2(0xed63a231, 0xd4c4fb27), RAPIDJSON_UINT64_C2(0xb0de6538, 0x8cc8ada8),
+        RAPIDJSON_UINT64_C2(0x83c7088e, 0x1aab65db), RAPIDJSON_UINT64_C2(0xc45d1df9, 0x42711d9a),
+        RAPIDJSON_UINT64_C2(0x924d692c, 0xa61be758), RAPIDJSON_UINT64_C2(0xda01ee64, 0x1a708dea),
+        RAPIDJSON_UINT64_C2(0xa26da399, 0x9aef774a), RAPIDJSON_UINT64_C2(0xf209787b, 0xb47d6b85),
+        RAPIDJSON_UINT64_C2(0xb454e4a1, 0x79dd1877), RAPIDJSON_UINT64_C2(0x865b8692, 0x5b9bc5c2),
+        RAPIDJSON_UINT64_C2(0xc83553c5, 0xc8965d3d), RAPIDJSON_UINT64_C2(0x952ab45c, 0xfa97a0b3),
+        RAPIDJSON_UINT64_C2(0xde469fbd, 0x99a05fe3), RAPIDJSON_UINT64_C2(0xa59bc234, 0xdb398c25),
+        RAPIDJSON_UINT64_C2(0xf6c69a72, 0xa3989f5c), RAPIDJSON_UINT64_C2(0xb7dcbf53, 0x54e9bece),
+        RAPIDJSON_UINT64_C2(0x88fcf317, 0xf22241e2), RAPIDJSON_UINT64_C2(0xcc20ce9b, 0xd35c78a5),
+        RAPIDJSON_UINT64_C2(0x98165af3, 0x7b2153df), RAPIDJSON_UINT64_C2(0xe2a0b5dc, 0x971f303a),
+        RAPIDJSON_UINT64_C2(0xa8d9d153, 0x5ce3b396), RAPIDJSON_UINT64_C2(0xfb9b7cd9, 0xa4a7443c),
+        RAPIDJSON_UINT64_C2(0xbb764c4c, 0xa7a44410), RAPIDJSON_UINT64_C2(0x8bab8eef, 0xb6409c1a),
+        RAPIDJSON_UINT64_C2(0xd01fef10, 0xa657842c), RAPIDJSON_UINT64_C2(0x9b10a4e5, 0xe9913129),
+        RAPIDJSON_UINT64_C2(0xe7109bfb, 0xa19c0c9d), RAPIDJSON_UINT64_C2(0xac2820d9, 0x623bf429),
+        RAPIDJSON_UINT64_C2(0x80444b5e, 0x7aa7cf85), RAPIDJSON_UINT64_C2(0xbf21e440, 0x03acdd2d),
+        RAPIDJSON_UINT64_C2(0x8e679c2f, 0x5e44ff8f), RAPIDJSON_UINT64_C2(0xd433179d, 0x9c8cb841),
+        RAPIDJSON_UINT64_C2(0x9e19db92, 0xb4e31ba9), RAPIDJSON_UINT64_C2(0xeb96bf6e, 0xbadf77d9),
+        RAPIDJSON_UINT64_C2(0xaf87023b, 0x9bf0ee6b)
+    };
+    static const int16_t kCachedPowers_E[] = {
+        -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007,  -980,
+        -954,  -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,
+        -688,  -661,  -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,
+        -422,  -396,  -369,  -343,  -316,  -289,  -263,  -236,  -210,  -183,
+        -157,  -130,  -103,   -77,   -50,   -24,     3,    30,    56,    83,
+        109,   136,   162,   189,   216,   242,   269,   295,   322,   348,
+        375,   402,   428,   455,   481,   508,   534,   561,   588,   614,
+        641,   667,   694,   720,   747,   774,   800,   827,   853,   880,
+        907,   933,   960,   986,  1013,  1039,  1066
+    };
+    RAPIDJSON_ASSERT(index < 87);
+    return DiyFp(kCachedPowers_F[index], kCachedPowers_E[index]);
+}
+
+inline DiyFp GetCachedPower(int e, int* K) {
+
+    //int k = static_cast<int>(ceil((-61 - e) * 0.30102999566398114)) + 374;
+    double dk = (-61 - e) * 0.30102999566398114 + 347;  // dk must be positive, so can do ceiling in positive
+    int k = static_cast<int>(dk);
+    if (dk - k > 0.0)
+        k++;
+
+    unsigned index = static_cast<unsigned>((k >> 3) + 1);
+    *K = -(-348 + static_cast<int>(index << 3));    // decimal exponent no need lookup table
+
+    return GetCachedPowerByIndex(index);
+}
+
+inline DiyFp GetCachedPower10(int exp, int *outExp) {
+    RAPIDJSON_ASSERT(exp >= -348);
+    unsigned index = static_cast<unsigned>(exp + 348) / 8u;
+    *outExp = -348 + static_cast<int>(index) * 8;
+    return GetCachedPowerByIndex(index);
+}
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_DIYFP_H_
diff --git a/include/rapidjson/internal/dtoa.h b/include/rapidjson/internal/dtoa.h
new file mode 100644
index 0000000000..cd456721a7
--- /dev/null
+++ b/include/rapidjson/internal/dtoa.h
@@ -0,0 +1,249 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+// This is a C++ header-only implementation of Grisu2 algorithm from the publication:
+// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with
+// integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+
+#ifndef RAPIDJSON_DTOA_
+#define RAPIDJSON_DTOA_
+
+#include "itoa.h" // GetDigitsLut()
+#include "diyfp.h"
+#include "ieee754.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+RAPIDJSON_DIAG_OFF(array-bounds) // some gcc versions generate wrong warnings https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+#endif
+
+inline void GrisuRound(char* buffer, int len, uint64_t delta, uint64_t rest, uint64_t ten_kappa, uint64_t wp_w) {
+    while (rest < wp_w && delta - rest >= ten_kappa &&
+           (rest + ten_kappa < wp_w ||  /// closer
+            wp_w - rest > rest + ten_kappa - wp_w)) {
+        buffer[len - 1]--;
+        rest += ten_kappa;
+    }
+}
+
+inline int CountDecimalDigit32(uint32_t n) {
+    // Simple pure C++ implementation was faster than __builtin_clz version in this situation.
+    if (n < 10) return 1;
+    if (n < 100) return 2;
+    if (n < 1000) return 3;
+    if (n < 10000) return 4;
+    if (n < 100000) return 5;
+    if (n < 1000000) return 6;
+    if (n < 10000000) return 7;
+    if (n < 100000000) return 8;
+    // Will not reach 10 digits in DigitGen()
+    //if (n < 1000000000) return 9;
+    //return 10;
+    return 9;
+}
+
+inline void DigitGen(const DiyFp& W, const DiyFp& Mp, uint64_t delta, char* buffer, int* len, int* K) {
+    static const uint64_t kPow10[] = { 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL,
+                                       1000000000ULL, 10000000000ULL, 100000000000ULL, 1000000000000ULL,
+                                       10000000000000ULL, 100000000000000ULL, 1000000000000000ULL,
+                                       10000000000000000ULL, 100000000000000000ULL, 1000000000000000000ULL,
+                                       10000000000000000000ULL };
+    const DiyFp one(uint64_t(1) << -Mp.e, Mp.e);
+    const DiyFp wp_w = Mp - W;
+    uint32_t p1 = static_cast<uint32_t>(Mp.f >> -one.e);
+    uint64_t p2 = Mp.f & (one.f - 1);
+    int kappa = CountDecimalDigit32(p1); // kappa in [0, 9]
+    *len = 0;
+
+    while (kappa > 0) {
+        uint32_t d = 0;
+        switch (kappa) {
+            case  9: d = p1 /  100000000; p1 %=  100000000; break;
+            case  8: d = p1 /   10000000; p1 %=   10000000; break;
+            case  7: d = p1 /    1000000; p1 %=    1000000; break;
+            case  6: d = p1 /     100000; p1 %=     100000; break;
+            case  5: d = p1 /      10000; p1 %=      10000; break;
+            case  4: d = p1 /       1000; p1 %=       1000; break;
+            case  3: d = p1 /        100; p1 %=        100; break;
+            case  2: d = p1 /         10; p1 %=         10; break;
+            case  1: d = p1;              p1 =           0; break;
+            default:;
+        }
+        if (d || *len)
+            buffer[(*len)++] = static_cast<char>('0' + static_cast<char>(d));
+        kappa--;
+        uint64_t tmp = (static_cast<uint64_t>(p1) << -one.e) + p2;
+        if (tmp <= delta) {
+            *K += kappa;
+            GrisuRound(buffer, *len, delta, tmp, kPow10[kappa] << -one.e, wp_w.f);
+            return;
+        }
+    }
+
+    // kappa = 0
+    for (;;) {
+        p2 *= 10;
+        delta *= 10;
+        char d = static_cast<char>(p2 >> -one.e);
+        if (d || *len)
+            buffer[(*len)++] = static_cast<char>('0' + d);
+        p2 &= one.f - 1;
+        kappa--;
+        if (p2 < delta) {
+            *K += kappa;
+            int index = -kappa;
+            GrisuRound(buffer, *len, delta, p2, one.f, wp_w.f * (index < 20 ? kPow10[index] : 0));
+            return;
+        }
+    }
+}
+
+inline void Grisu2(double value, char* buffer, int* length, int* K) {
+    const DiyFp v(value);
+    DiyFp w_m, w_p;
+    v.NormalizedBoundaries(&w_m, &w_p);
+
+    const DiyFp c_mk = GetCachedPower(w_p.e, K);
+    const DiyFp W = v.Normalize() * c_mk;
+    DiyFp Wp = w_p * c_mk;
+    DiyFp Wm = w_m * c_mk;
+    Wm.f++;
+    Wp.f--;
+    DigitGen(W, Wp, Wp.f - Wm.f, buffer, length, K);
+}
+
+inline char* WriteExponent(int K, char* buffer) {
+    if (K < 0) {
+        *buffer++ = '-';
+        K = -K;
+    }
+
+    if (K >= 100) {
+        *buffer++ = static_cast<char>('0' + static_cast<char>(K / 100));
+        K %= 100;
+        const char* d = GetDigitsLut() + K * 2;
+        *buffer++ = d[0];
+        *buffer++ = d[1];
+    }
+    else if (K >= 10) {
+        const char* d = GetDigitsLut() + K * 2;
+        *buffer++ = d[0];
+        *buffer++ = d[1];
+    }
+    else
+        *buffer++ = static_cast<char>('0' + static_cast<char>(K));
+
+    return buffer;
+}
+
+inline char* Prettify(char* buffer, int length, int k, int maxDecimalPlaces) {
+    const int kk = length + k;  // 10^(kk-1) <= v < 10^kk
+
+    if (0 <= k && kk <= 21) {
+        // 1234e7 -> 12340000000
+        for (int i = length; i < kk; i++)
+            buffer[i] = '0';
+        buffer[kk] = '.';
+        buffer[kk + 1] = '0';
+        return &buffer[kk + 2];
+    }
+    else if (0 < kk && kk <= 21) {
+        // 1234e-2 -> 12.34
+        std::memmove(&buffer[kk + 1], &buffer[kk], static_cast<size_t>(length - kk));
+        buffer[kk] = '.';
+        if (0 > k + maxDecimalPlaces) {
+            // When maxDecimalPlaces = 2, 1.2345 -> 1.23, 1.102 -> 1.1
+            // Remove extra trailing zeros (at least one) after truncation.
+            for (int i = kk + maxDecimalPlaces; i > kk + 1; i--)
+                if (buffer[i] != '0')
+                    return &buffer[i + 1];
+            return &buffer[kk + 2]; // Reserve one zero
+        }
+        else
+            return &buffer[length + 1];
+    }
+    else if (-6 < kk && kk <= 0) {
+        // 1234e-6 -> 0.001234
+        const int offset = 2 - kk;
+        std::memmove(&buffer[offset], &buffer[0], static_cast<size_t>(length));
+        buffer[0] = '0';
+        buffer[1] = '.';
+        for (int i = 2; i < offset; i++)
+            buffer[i] = '0';
+        if (length - kk > maxDecimalPlaces) {
+            // When maxDecimalPlaces = 2, 0.123 -> 0.12, 0.102 -> 0.1
+            // Remove extra trailing zeros (at least one) after truncation.
+            for (int i = maxDecimalPlaces + 1; i > 2; i--)
+                if (buffer[i] != '0')
+                    return &buffer[i + 1];
+            return &buffer[3]; // Reserve one zero
+        }
+        else
+            return &buffer[length + offset];
+    }
+    else if (kk < -maxDecimalPlaces) {
+        // Truncate to zero
+        buffer[0] = '0';
+        buffer[1] = '.';
+        buffer[2] = '0';
+        return &buffer[3];
+    }
+    else if (length == 1) {
+        // 1e30
+        buffer[1] = 'e';
+        return WriteExponent(kk - 1, &buffer[2]);
+    }
+    else {
+        // 1234e30 -> 1.234e33
+        std::memmove(&buffer[2], &buffer[1], static_cast<size_t>(length - 1));
+        buffer[1] = '.';
+        buffer[length + 1] = 'e';
+        return WriteExponent(kk - 1, &buffer[0 + length + 2]);
+    }
+}
+
+inline char* dtoa(double value, char* buffer, int maxDecimalPlaces = 324) {
+    RAPIDJSON_ASSERT(maxDecimalPlaces >= 1);
+    Double d(value);
+    if (d.IsZero()) {
+        if (d.Sign())
+            *buffer++ = '-';     // -0.0, Issue #289
+        buffer[0] = '0';
+        buffer[1] = '.';
+        buffer[2] = '0';
+        return &buffer[3];
+    }
+    else {
+        if (value < 0) {
+            *buffer++ = '-';
+            value = -value;
+        }
+        int length, K;
+        Grisu2(value, buffer, &length, &K);
+        return Prettify(buffer, length, K, maxDecimalPlaces);
+    }
+}
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_DTOA_
diff --git a/include/rapidjson/internal/ieee754.h b/include/rapidjson/internal/ieee754.h
new file mode 100644
index 0000000000..68c9e96649
--- /dev/null
+++ b/include/rapidjson/internal/ieee754.h
@@ -0,0 +1,78 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_IEEE754_
+#define RAPIDJSON_IEEE754_
+
+#include "../rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+class Double {
+public:
+    Double() {}
+    Double(double d) : d_(d) {}
+    Double(uint64_t u) : u_(u) {}
+
+    double Value() const { return d_; }
+    uint64_t Uint64Value() const { return u_; }
+
+    double NextPositiveDouble() const {
+        RAPIDJSON_ASSERT(!Sign());
+        return Double(u_ + 1).Value();
+    }
+
+    bool Sign() const { return (u_ & kSignMask) != 0; }
+    uint64_t Significand() const { return u_ & kSignificandMask; }
+    int Exponent() const { return static_cast<int>(((u_ & kExponentMask) >> kSignificandSize) - kExponentBias); }
+
+    bool IsNan() const { return (u_ & kExponentMask) == kExponentMask && Significand() != 0; }
+    bool IsInf() const { return (u_ & kExponentMask) == kExponentMask && Significand() == 0; }
+    bool IsNanOrInf() const { return (u_ & kExponentMask) == kExponentMask; }
+    bool IsNormal() const { return (u_ & kExponentMask) != 0 || Significand() == 0; }
+    bool IsZero() const { return (u_ & (kExponentMask | kSignificandMask)) == 0; }
+
+    uint64_t IntegerSignificand() const { return IsNormal() ? Significand() | kHiddenBit : Significand(); }
+    int IntegerExponent() const { return (IsNormal() ? Exponent() : kDenormalExponent) - kSignificandSize; }
+    uint64_t ToBias() const { return (u_ & kSignMask) ? ~u_ + 1 : u_ | kSignMask; }
+
+    static int EffectiveSignificandSize(int order) {
+        if (order >= -1021)
+            return 53;
+        else if (order <= -1074)
+            return 0;
+        else
+            return order + 1074;
+    }
+
+private:
+    static const int kSignificandSize = 52;
+    static const int kExponentBias = 0x3FF;
+    static const int kDenormalExponent = 1 - kExponentBias;
+    static const uint64_t kSignMask = RAPIDJSON_UINT64_C2(0x80000000, 0x00000000);
+    static const uint64_t kExponentMask = RAPIDJSON_UINT64_C2(0x7FF00000, 0x00000000);
+    static const uint64_t kSignificandMask = RAPIDJSON_UINT64_C2(0x000FFFFF, 0xFFFFFFFF);
+    static const uint64_t kHiddenBit = RAPIDJSON_UINT64_C2(0x00100000, 0x00000000);
+
+    union {
+        double d_;
+        uint64_t u_;
+    };
+};
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_IEEE754_
diff --git a/include/rapidjson/internal/itoa.h b/include/rapidjson/internal/itoa.h
new file mode 100644
index 0000000000..9fe8c932ff
--- /dev/null
+++ b/include/rapidjson/internal/itoa.h
@@ -0,0 +1,308 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ITOA_
+#define RAPIDJSON_ITOA_
+
+#include "../rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+inline const char* GetDigitsLut() {
+    static const char cDigitsLut[200] = {
+        '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+        '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+        '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+        '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+        '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+        '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+        '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+        '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+        '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+        '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+    };
+    return cDigitsLut;
+}
+
+inline char* u32toa(uint32_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+
+    const char* cDigitsLut = GetDigitsLut();
+
+    if (value < 10000) {
+        const uint32_t d1 = (value / 100) << 1;
+        const uint32_t d2 = (value % 100) << 1;
+
+        if (value >= 1000)
+            *buffer++ = cDigitsLut[d1];
+        if (value >= 100)
+            *buffer++ = cDigitsLut[d1 + 1];
+        if (value >= 10)
+            *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+    }
+    else if (value < 100000000) {
+        // value = bbbbcccc
+        const uint32_t b = value / 10000;
+        const uint32_t c = value % 10000;
+
+        const uint32_t d1 = (b / 100) << 1;
+        const uint32_t d2 = (b % 100) << 1;
+
+        const uint32_t d3 = (c / 100) << 1;
+        const uint32_t d4 = (c % 100) << 1;
+
+        if (value >= 10000000)
+            *buffer++ = cDigitsLut[d1];
+        if (value >= 1000000)
+            *buffer++ = cDigitsLut[d1 + 1];
+        if (value >= 100000)
+            *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+
+        *buffer++ = cDigitsLut[d3];
+        *buffer++ = cDigitsLut[d3 + 1];
+        *buffer++ = cDigitsLut[d4];
+        *buffer++ = cDigitsLut[d4 + 1];
+    }
+    else {
+        // value = aabbbbcccc in decimal
+
+        const uint32_t a = value / 100000000; // 1 to 42
+        value %= 100000000;
+
+        if (a >= 10) {
+            const unsigned i = a << 1;
+            *buffer++ = cDigitsLut[i];
+            *buffer++ = cDigitsLut[i + 1];
+        }
+        else
+            *buffer++ = static_cast<char>('0' + static_cast<char>(a));
+
+        const uint32_t b = value / 10000; // 0 to 9999
+        const uint32_t c = value % 10000; // 0 to 9999
+
+        const uint32_t d1 = (b / 100) << 1;
+        const uint32_t d2 = (b % 100) << 1;
+
+        const uint32_t d3 = (c / 100) << 1;
+        const uint32_t d4 = (c % 100) << 1;
+
+        *buffer++ = cDigitsLut[d1];
+        *buffer++ = cDigitsLut[d1 + 1];
+        *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+        *buffer++ = cDigitsLut[d3];
+        *buffer++ = cDigitsLut[d3 + 1];
+        *buffer++ = cDigitsLut[d4];
+        *buffer++ = cDigitsLut[d4 + 1];
+    }
+    return buffer;
+}
+
+inline char* i32toa(int32_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+    uint32_t u = static_cast<uint32_t>(value);
+    if (value < 0) {
+        *buffer++ = '-';
+        u = ~u + 1;
+    }
+
+    return u32toa(u, buffer);
+}
+
+inline char* u64toa(uint64_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+    const char* cDigitsLut = GetDigitsLut();
+    const uint64_t  kTen8 = 100000000;
+    const uint64_t  kTen9 = kTen8 * 10;
+    const uint64_t kTen10 = kTen8 * 100;
+    const uint64_t kTen11 = kTen8 * 1000;
+    const uint64_t kTen12 = kTen8 * 10000;
+    const uint64_t kTen13 = kTen8 * 100000;
+    const uint64_t kTen14 = kTen8 * 1000000;
+    const uint64_t kTen15 = kTen8 * 10000000;
+    const uint64_t kTen16 = kTen8 * kTen8;
+
+    if (value < kTen8) {
+        uint32_t v = static_cast<uint32_t>(value);
+        if (v < 10000) {
+            const uint32_t d1 = (v / 100) << 1;
+            const uint32_t d2 = (v % 100) << 1;
+
+            if (v >= 1000)
+                *buffer++ = cDigitsLut[d1];
+            if (v >= 100)
+                *buffer++ = cDigitsLut[d1 + 1];
+            if (v >= 10)
+                *buffer++ = cDigitsLut[d2];
+            *buffer++ = cDigitsLut[d2 + 1];
+        }
+        else {
+            // value = bbbbcccc
+            const uint32_t b = v / 10000;
+            const uint32_t c = v % 10000;
+
+            const uint32_t d1 = (b / 100) << 1;
+            const uint32_t d2 = (b % 100) << 1;
+
+            const uint32_t d3 = (c / 100) << 1;
+            const uint32_t d4 = (c % 100) << 1;
+
+            if (value >= 10000000)
+                *buffer++ = cDigitsLut[d1];
+            if (value >= 1000000)
+                *buffer++ = cDigitsLut[d1 + 1];
+            if (value >= 100000)
+                *buffer++ = cDigitsLut[d2];
+            *buffer++ = cDigitsLut[d2 + 1];
+
+            *buffer++ = cDigitsLut[d3];
+            *buffer++ = cDigitsLut[d3 + 1];
+            *buffer++ = cDigitsLut[d4];
+            *buffer++ = cDigitsLut[d4 + 1];
+        }
+    }
+    else if (value < kTen16) {
+        const uint32_t v0 = static_cast<uint32_t>(value / kTen8);
+        const uint32_t v1 = static_cast<uint32_t>(value % kTen8);
+
+        const uint32_t b0 = v0 / 10000;
+        const uint32_t c0 = v0 % 10000;
+
+        const uint32_t d1 = (b0 / 100) << 1;
+        const uint32_t d2 = (b0 % 100) << 1;
+
+        const uint32_t d3 = (c0 / 100) << 1;
+        const uint32_t d4 = (c0 % 100) << 1;
+
+        const uint32_t b1 = v1 / 10000;
+        const uint32_t c1 = v1 % 10000;
+
+        const uint32_t d5 = (b1 / 100) << 1;
+        const uint32_t d6 = (b1 % 100) << 1;
+
+        const uint32_t d7 = (c1 / 100) << 1;
+        const uint32_t d8 = (c1 % 100) << 1;
+
+        if (value >= kTen15)
+            *buffer++ = cDigitsLut[d1];
+        if (value >= kTen14)
+            *buffer++ = cDigitsLut[d1 + 1];
+        if (value >= kTen13)
+            *buffer++ = cDigitsLut[d2];
+        if (value >= kTen12)
+            *buffer++ = cDigitsLut[d2 + 1];
+        if (value >= kTen11)
+            *buffer++ = cDigitsLut[d3];
+        if (value >= kTen10)
+            *buffer++ = cDigitsLut[d3 + 1];
+        if (value >= kTen9)
+            *buffer++ = cDigitsLut[d4];
+
+        *buffer++ = cDigitsLut[d4 + 1];
+        *buffer++ = cDigitsLut[d5];
+        *buffer++ = cDigitsLut[d5 + 1];
+        *buffer++ = cDigitsLut[d6];
+        *buffer++ = cDigitsLut[d6 + 1];
+        *buffer++ = cDigitsLut[d7];
+        *buffer++ = cDigitsLut[d7 + 1];
+        *buffer++ = cDigitsLut[d8];
+        *buffer++ = cDigitsLut[d8 + 1];
+    }
+    else {
+        const uint32_t a = static_cast<uint32_t>(value / kTen16); // 1 to 1844
+        value %= kTen16;
+
+        if (a < 10)
+            *buffer++ = static_cast<char>('0' + static_cast<char>(a));
+        else if (a < 100) {
+            const uint32_t i = a << 1;
+            *buffer++ = cDigitsLut[i];
+            *buffer++ = cDigitsLut[i + 1];
+        }
+        else if (a < 1000) {
+            *buffer++ = static_cast<char>('0' + static_cast<char>(a / 100));
+
+            const uint32_t i = (a % 100) << 1;
+            *buffer++ = cDigitsLut[i];
+            *buffer++ = cDigitsLut[i + 1];
+        }
+        else {
+            const uint32_t i = (a / 100) << 1;
+            const uint32_t j = (a % 100) << 1;
+            *buffer++ = cDigitsLut[i];
+            *buffer++ = cDigitsLut[i + 1];
+            *buffer++ = cDigitsLut[j];
+            *buffer++ = cDigitsLut[j + 1];
+        }
+
+        const uint32_t v0 = static_cast<uint32_t>(value / kTen8);
+        const uint32_t v1 = static_cast<uint32_t>(value % kTen8);
+
+        const uint32_t b0 = v0 / 10000;
+        const uint32_t c0 = v0 % 10000;
+
+        const uint32_t d1 = (b0 / 100) << 1;
+        const uint32_t d2 = (b0 % 100) << 1;
+
+        const uint32_t d3 = (c0 / 100) << 1;
+        const uint32_t d4 = (c0 % 100) << 1;
+
+        const uint32_t b1 = v1 / 10000;
+        const uint32_t c1 = v1 % 10000;
+
+        const uint32_t d5 = (b1 / 100) << 1;
+        const uint32_t d6 = (b1 % 100) << 1;
+
+        const uint32_t d7 = (c1 / 100) << 1;
+        const uint32_t d8 = (c1 % 100) << 1;
+
+        *buffer++ = cDigitsLut[d1];
+        *buffer++ = cDigitsLut[d1 + 1];
+        *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+        *buffer++ = cDigitsLut[d3];
+        *buffer++ = cDigitsLut[d3 + 1];
+        *buffer++ = cDigitsLut[d4];
+        *buffer++ = cDigitsLut[d4 + 1];
+        *buffer++ = cDigitsLut[d5];
+        *buffer++ = cDigitsLut[d5 + 1];
+        *buffer++ = cDigitsLut[d6];
+        *buffer++ = cDigitsLut[d6 + 1];
+        *buffer++ = cDigitsLut[d7];
+        *buffer++ = cDigitsLut[d7 + 1];
+        *buffer++ = cDigitsLut[d8];
+        *buffer++ = cDigitsLut[d8 + 1];
+    }
+
+    return buffer;
+}
+
+inline char* i64toa(int64_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+    uint64_t u = static_cast<uint64_t>(value);
+    if (value < 0) {
+        *buffer++ = '-';
+        u = ~u + 1;
+    }
+
+    return u64toa(u, buffer);
+}
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_ITOA_
diff --git a/include/rapidjson/internal/meta.h b/include/rapidjson/internal/meta.h
new file mode 100644
index 0000000000..27092dc0d6
--- /dev/null
+++ b/include/rapidjson/internal/meta.h
@@ -0,0 +1,186 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_META_H_
+#define RAPIDJSON_INTERNAL_META_H_
+
+#include "../rapidjson.h"
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(6334)
+#endif
+
+#if RAPIDJSON_HAS_CXX11_TYPETRAITS
+#include <type_traits>
+#endif
+
+//@cond RAPIDJSON_INTERNAL
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+// Helper to wrap/convert arbitrary types to void, useful for arbitrary type matching
+template <typename T> struct Void { typedef void Type; };
+
+///////////////////////////////////////////////////////////////////////////////
+// BoolType, TrueType, FalseType
+//
+template <bool Cond> struct BoolType {
+    static const bool Value = Cond;
+    typedef BoolType Type;
+};
+typedef BoolType<true> TrueType;
+typedef BoolType<false> FalseType;
+
+
+///////////////////////////////////////////////////////////////////////////////
+// SelectIf, BoolExpr, NotExpr, AndExpr, OrExpr
+//
+
+template <bool C> struct SelectIfImpl { template <typename T1, typename T2> struct Apply { typedef T1 Type; }; };
+template <> struct SelectIfImpl<false> { template <typename T1, typename T2> struct Apply { typedef T2 Type; }; };
+template <bool C, typename T1, typename T2> struct SelectIfCond : SelectIfImpl<C>::template Apply<T1,T2> {};
+template <typename C, typename T1, typename T2> struct SelectIf : SelectIfCond<C::Value, T1, T2> {};
+
+template <bool Cond1, bool Cond2> struct AndExprCond : FalseType {};
+template <> struct AndExprCond<true, true> : TrueType {};
+template <bool Cond1, bool Cond2> struct OrExprCond : TrueType {};
+template <> struct OrExprCond<false, false> : FalseType {};
+
+template <typename C> struct BoolExpr : SelectIf<C,TrueType,FalseType>::Type {};
+template <typename C> struct NotExpr  : SelectIf<C,FalseType,TrueType>::Type {};
+template <typename C1, typename C2> struct AndExpr : AndExprCond<C1::Value, C2::Value>::Type {};
+template <typename C1, typename C2> struct OrExpr  : OrExprCond<C1::Value, C2::Value>::Type {};
+
+
+///////////////////////////////////////////////////////////////////////////////
+// AddConst, MaybeAddConst, RemoveConst
+template <typename T> struct AddConst { typedef const T Type; };
+template <bool Constify, typename T> struct MaybeAddConst : SelectIfCond<Constify, const T, T> {};
+template <typename T> struct RemoveConst { typedef T Type; };
+template <typename T> struct RemoveConst<const T> { typedef T Type; };
+
+
+///////////////////////////////////////////////////////////////////////////////
+// IsSame, IsConst, IsMoreConst, IsPointer
+//
+template <typename T, typename U> struct IsSame : FalseType {};
+template <typename T> struct IsSame<T, T> : TrueType {};
+
+template <typename T> struct IsConst : FalseType {};
+template <typename T> struct IsConst<const T> : TrueType {};
+
+template <typename CT, typename T>
+struct IsMoreConst
+    : AndExpr<IsSame<typename RemoveConst<CT>::Type, typename RemoveConst<T>::Type>,
+              BoolType<IsConst<CT>::Value >= IsConst<T>::Value> >::Type {};
+
+template <typename T> struct IsPointer : FalseType {};
+template <typename T> struct IsPointer<T*> : TrueType {};
+
+///////////////////////////////////////////////////////////////////////////////
+// IsBaseOf
+//
+#if RAPIDJSON_HAS_CXX11_TYPETRAITS
+
+template <typename B, typename D> struct IsBaseOf
+    : BoolType< ::std::is_base_of<B,D>::value> {};
+
+#else // simplified version adopted from Boost
+
+template<typename B, typename D> struct IsBaseOfImpl {
+    RAPIDJSON_STATIC_ASSERT(sizeof(B) != 0);
+    RAPIDJSON_STATIC_ASSERT(sizeof(D) != 0);
+
+    typedef char (&Yes)[1];
+    typedef char (&No) [2];
+
+    template <typename T>
+    static Yes Check(const D*, T);
+    static No  Check(const B*, int);
+
+    struct Host {
+        operator const B*() const;
+        operator const D*();
+    };
+
+    enum { Value = (sizeof(Check(Host(), 0)) == sizeof(Yes)) };
+};
+
+template <typename B, typename D> struct IsBaseOf
+    : OrExpr<IsSame<B, D>, BoolExpr<IsBaseOfImpl<B, D> > >::Type {};
+
+#endif // RAPIDJSON_HAS_CXX11_TYPETRAITS
+
+
+//////////////////////////////////////////////////////////////////////////
+// EnableIf / DisableIf
+//
+template <bool Condition, typename T = void> struct EnableIfCond  { typedef T Type; };
+template <typename T> struct EnableIfCond<false, T> { /* empty */ };
+
+template <bool Condition, typename T = void> struct DisableIfCond { typedef T Type; };
+template <typename T> struct DisableIfCond<true, T> { /* empty */ };
+
+template <typename Condition, typename T = void>
+struct EnableIf : EnableIfCond<Condition::Value, T> {};
+
+template <typename Condition, typename T = void>
+struct DisableIf : DisableIfCond<Condition::Value, T> {};
+
+// SFINAE helpers
+struct SfinaeTag {};
+template <typename T> struct RemoveSfinaeTag;
+template <typename T> struct RemoveSfinaeTag<SfinaeTag&(*)(T)> { typedef T Type; };
+
+#define RAPIDJSON_REMOVEFPTR_(type) \
+    typename ::RAPIDJSON_NAMESPACE::internal::RemoveSfinaeTag \
+        < ::RAPIDJSON_NAMESPACE::internal::SfinaeTag&(*) type>::Type
+
+#define RAPIDJSON_ENABLEIF(cond) \
+    typename ::RAPIDJSON_NAMESPACE::internal::EnableIf \
+        <RAPIDJSON_REMOVEFPTR_(cond)>::Type * = NULL
+
+#define RAPIDJSON_DISABLEIF(cond) \
+    typename ::RAPIDJSON_NAMESPACE::internal::DisableIf \
+        <RAPIDJSON_REMOVEFPTR_(cond)>::Type * = NULL
+
+#define RAPIDJSON_ENABLEIF_RETURN(cond,returntype) \
+    typename ::RAPIDJSON_NAMESPACE::internal::EnableIf \
+        <RAPIDJSON_REMOVEFPTR_(cond), \
+         RAPIDJSON_REMOVEFPTR_(returntype)>::Type
+
+#define RAPIDJSON_DISABLEIF_RETURN(cond,returntype) \
+    typename ::RAPIDJSON_NAMESPACE::internal::DisableIf \
+        <RAPIDJSON_REMOVEFPTR_(cond), \
+         RAPIDJSON_REMOVEFPTR_(returntype)>::Type
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+//@endcond
+
+#if defined(_MSC_VER) && !defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_INTERNAL_META_H_
diff --git a/include/rapidjson/internal/pow10.h b/include/rapidjson/internal/pow10.h
new file mode 100644
index 0000000000..eae1a43ed1
--- /dev/null
+++ b/include/rapidjson/internal/pow10.h
@@ -0,0 +1,55 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_POW10_
+#define RAPIDJSON_POW10_
+
+#include "../rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+//! Computes integer powers of 10 in double (10.0^n).
+/*! This function uses lookup table for fast and accurate results.
+    \param n non-negative exponent. Must <= 308.
+    \return 10.0^n
+*/
+inline double Pow10(int n) {
+    static const double e[] = { // 1e-0...1e308: 309 * 8 bytes = 2472 bytes
+        1e+0,  
+        1e+1,  1e+2,  1e+3,  1e+4,  1e+5,  1e+6,  1e+7,  1e+8,  1e+9,  1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, 
+        1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40,
+        1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, 1e+49, 1e+50, 1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60,
+        1e+61, 1e+62, 1e+63, 1e+64, 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, 1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80,
+        1e+81, 1e+82, 1e+83, 1e+84, 1e+85, 1e+86, 1e+87, 1e+88, 1e+89, 1e+90, 1e+91, 1e+92, 1e+93, 1e+94, 1e+95, 1e+96, 1e+97, 1e+98, 1e+99, 1e+100,
+        1e+101,1e+102,1e+103,1e+104,1e+105,1e+106,1e+107,1e+108,1e+109,1e+110,1e+111,1e+112,1e+113,1e+114,1e+115,1e+116,1e+117,1e+118,1e+119,1e+120,
+        1e+121,1e+122,1e+123,1e+124,1e+125,1e+126,1e+127,1e+128,1e+129,1e+130,1e+131,1e+132,1e+133,1e+134,1e+135,1e+136,1e+137,1e+138,1e+139,1e+140,
+        1e+141,1e+142,1e+143,1e+144,1e+145,1e+146,1e+147,1e+148,1e+149,1e+150,1e+151,1e+152,1e+153,1e+154,1e+155,1e+156,1e+157,1e+158,1e+159,1e+160,
+        1e+161,1e+162,1e+163,1e+164,1e+165,1e+166,1e+167,1e+168,1e+169,1e+170,1e+171,1e+172,1e+173,1e+174,1e+175,1e+176,1e+177,1e+178,1e+179,1e+180,
+        1e+181,1e+182,1e+183,1e+184,1e+185,1e+186,1e+187,1e+188,1e+189,1e+190,1e+191,1e+192,1e+193,1e+194,1e+195,1e+196,1e+197,1e+198,1e+199,1e+200,
+        1e+201,1e+202,1e+203,1e+204,1e+205,1e+206,1e+207,1e+208,1e+209,1e+210,1e+211,1e+212,1e+213,1e+214,1e+215,1e+216,1e+217,1e+218,1e+219,1e+220,
+        1e+221,1e+222,1e+223,1e+224,1e+225,1e+226,1e+227,1e+228,1e+229,1e+230,1e+231,1e+232,1e+233,1e+234,1e+235,1e+236,1e+237,1e+238,1e+239,1e+240,
+        1e+241,1e+242,1e+243,1e+244,1e+245,1e+246,1e+247,1e+248,1e+249,1e+250,1e+251,1e+252,1e+253,1e+254,1e+255,1e+256,1e+257,1e+258,1e+259,1e+260,
+        1e+261,1e+262,1e+263,1e+264,1e+265,1e+266,1e+267,1e+268,1e+269,1e+270,1e+271,1e+272,1e+273,1e+274,1e+275,1e+276,1e+277,1e+278,1e+279,1e+280,
+        1e+281,1e+282,1e+283,1e+284,1e+285,1e+286,1e+287,1e+288,1e+289,1e+290,1e+291,1e+292,1e+293,1e+294,1e+295,1e+296,1e+297,1e+298,1e+299,1e+300,
+        1e+301,1e+302,1e+303,1e+304,1e+305,1e+306,1e+307,1e+308
+    };
+    RAPIDJSON_ASSERT(n >= 0 && n <= 308);
+    return e[n];
+}
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_POW10_
diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
new file mode 100644
index 0000000000..7740dcd527
--- /dev/null
+++ b/include/rapidjson/internal/regex.h
@@ -0,0 +1,739 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_REGEX_H_
+#define RAPIDJSON_INTERNAL_REGEX_H_
+
+#include "../allocators.h"
+#include "../stream.h"
+#include "stack.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(switch-enum)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifndef RAPIDJSON_REGEX_VERBOSE
+#define RAPIDJSON_REGEX_VERBOSE 0
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////////////
+// DecodedStream
+
+template <typename SourceStream, typename Encoding>
+class DecodedStream {
+public:
+    DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
+    unsigned Peek() { return codepoint_; }
+    unsigned Take() {
+        unsigned c = codepoint_;
+        if (c) // No further decoding when '\0'
+            Decode();
+        return c;
+    }
+
+private:
+    void Decode() {
+        if (!Encoding::Decode(ss_, &codepoint_))
+            codepoint_ = 0;
+    }
+
+    SourceStream& ss_;
+    unsigned codepoint_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericRegex
+
+static const SizeType kRegexInvalidState = ~SizeType(0);  //!< Represents an invalid index in GenericRegex::State::out, out1
+static const SizeType kRegexInvalidRange = ~SizeType(0);
+
+template <typename Encoding, typename Allocator>
+class GenericRegexSearch;
+
+//! Regular expression engine with subset of ECMAscript grammar.
+/*!
+    Supported regular expression syntax:
+    - \c ab     Concatenation
+    - \c a|b    Alternation
+    - \c a?     Zero or one
+    - \c a*     Zero or more
+    - \c a+     One or more
+    - \c a{3}   Exactly 3 times
+    - \c a{3,}  At least 3 times
+    - \c a{3,5} 3 to 5 times
+    - \c (ab)   Grouping
+    - \c ^a     At the beginning
+    - \c a$     At the end
+    - \c .      Any character
+    - \c [abc]  Character classes
+    - \c [a-c]  Character class range
+    - \c [a-z0-9_] Character class combination
+    - \c [^abc] Negated character classes
+    - \c [^a-c] Negated character class range
+    - \c [\b]   Backspace (U+0008)
+    - \c \\| \\\\ ...  Escape characters
+    - \c \\f Form feed (U+000C)
+    - \c \\n Line feed (U+000A)
+    - \c \\r Carriage return (U+000D)
+    - \c \\t Tab (U+0009)
+    - \c \\v Vertical tab (U+000B)
+
+    \note This is a Thompson NFA engine, implemented with reference to 
+        Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).", 
+        https://swtch.com/~rsc/regexp/regexp1.html 
+*/
+template <typename Encoding, typename Allocator = CrtAllocator>
+class GenericRegex {
+public:
+    typedef Encoding EncodingType;
+    typedef typename Encoding::Ch Ch;
+    template <typename, typename> friend class GenericRegexSearch;
+
+    GenericRegex(const Ch* source, Allocator* allocator = 0) : 
+        ownAllocator_(allocator ? 0 : RAPIDJSON_NEW(Allocator)()), allocator_(allocator ? allocator : ownAllocator_), 
+        states_(allocator_, 256), ranges_(allocator_, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), 
+        anchorBegin_(), anchorEnd_()
+    {
+        GenericStringStream<Encoding> ss(source);
+        DecodedStream<GenericStringStream<Encoding>, Encoding> ds(ss);
+        Parse(ds);
+    }
+
+    ~GenericRegex()
+    {
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    bool IsValid() const {
+        return root_ != kRegexInvalidState;
+    }
+
+private:
+    enum Operator {
+        kZeroOrOne,
+        kZeroOrMore,
+        kOneOrMore,
+        kConcatenation,
+        kAlternation,
+        kLeftParenthesis
+    };
+
+    static const unsigned kAnyCharacterClass = 0xFFFFFFFF;   //!< For '.'
+    static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
+    static const unsigned kRangeNegationFlag = 0x80000000;
+
+    struct Range {
+        unsigned start; // 
+        unsigned end;
+        SizeType next;
+    };
+
+    struct State {
+        SizeType out;     //!< Equals to kInvalid for matching state
+        SizeType out1;    //!< Equals to non-kInvalid for split
+        SizeType rangeStart;
+        unsigned codepoint;
+    };
+
+    struct Frag {
+        Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {}
+        SizeType start;
+        SizeType out; //!< link-list of all output states
+        SizeType minIndex;
+    };
+
+    State& GetState(SizeType index) {
+        RAPIDJSON_ASSERT(index < stateCount_);
+        return states_.template Bottom<State>()[index];
+    }
+
+    const State& GetState(SizeType index) const {
+        RAPIDJSON_ASSERT(index < stateCount_);
+        return states_.template Bottom<State>()[index];
+    }
+
+    Range& GetRange(SizeType index) {
+        RAPIDJSON_ASSERT(index < rangeCount_);
+        return ranges_.template Bottom<Range>()[index];
+    }
+
+    const Range& GetRange(SizeType index) const {
+        RAPIDJSON_ASSERT(index < rangeCount_);
+        return ranges_.template Bottom<Range>()[index];
+    }
+
+    template <typename InputStream>
+    void Parse(DecodedStream<InputStream, Encoding>& ds) {
+        Stack<Allocator> operandStack(allocator_, 256);    // Frag
+        Stack<Allocator> operatorStack(allocator_, 256);   // Operator
+        Stack<Allocator> atomCountStack(allocator_, 256);  // unsigned (Atom per parenthesis)
+
+        *atomCountStack.template Push<unsigned>() = 0;
+
+        unsigned codepoint;
+        while (ds.Peek() != 0) {
+            switch (codepoint = ds.Take()) {
+                case '^':
+                    anchorBegin_ = true;
+                    break;
+
+                case '$':
+                    anchorEnd_ = true;
+                    break;
+
+                case '|':
+                    while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
+                        if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
+                            return;
+                    *operatorStack.template Push<Operator>() = kAlternation;
+                    *atomCountStack.template Top<unsigned>() = 0;
+                    break;
+
+                case '(':
+                    *operatorStack.template Push<Operator>() = kLeftParenthesis;
+                    *atomCountStack.template Push<unsigned>() = 0;
+                    break;
+
+                case ')':
+                    while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
+                        if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
+                            return;
+                    if (operatorStack.Empty())
+                        return;
+                    operatorStack.template Pop<Operator>(1);
+                    atomCountStack.template Pop<unsigned>(1);
+                    ImplicitConcatenation(atomCountStack, operatorStack);
+                    break;
+
+                case '?':
+                    if (!Eval(operandStack, kZeroOrOne))
+                        return;
+                    break;
+
+                case '*':
+                    if (!Eval(operandStack, kZeroOrMore))
+                        return;
+                    break;
+
+                case '+':
+                    if (!Eval(operandStack, kOneOrMore))
+                        return;
+                    break;
+
+                case '{':
+                    {
+                        unsigned n, m;
+                        if (!ParseUnsigned(ds, &n))
+                            return;
+
+                        if (ds.Peek() == ',') {
+                            ds.Take();
+                            if (ds.Peek() == '}')
+                                m = kInfinityQuantifier;
+                            else if (!ParseUnsigned(ds, &m) || m < n)
+                                return;
+                        }
+                        else
+                            m = n;
+
+                        if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
+                            return;
+                        ds.Take();
+                    }
+                    break;
+
+                case '.':
+                    PushOperand(operandStack, kAnyCharacterClass);
+                    ImplicitConcatenation(atomCountStack, operatorStack);
+                    break;
+
+                case '[':
+                    {
+                        SizeType range;
+                        if (!ParseRange(ds, &range))
+                            return;
+                        SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
+                        GetState(s).rangeStart = range;
+                        *operandStack.template Push<Frag>() = Frag(s, s, s);
+                    }
+                    ImplicitConcatenation(atomCountStack, operatorStack);
+                    break;
+
+                case '\\': // Escape character
+                    if (!CharacterEscape(ds, &codepoint))
+                        return; // Unsupported escape character
+                    // fall through to default
+                    RAPIDJSON_DELIBERATE_FALLTHROUGH;
+
+                default: // Pattern character
+                    PushOperand(operandStack, codepoint);
+                    ImplicitConcatenation(atomCountStack, operatorStack);
+            }
+        }
+
+        while (!operatorStack.Empty())
+            if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
+                return;
+
+        // Link the operand to matching state.
+        if (operandStack.GetSize() == sizeof(Frag)) {
+            Frag* e = operandStack.template Pop<Frag>(1);
+            Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
+            root_ = e->start;
+
+#if RAPIDJSON_REGEX_VERBOSE
+            printf("root: %d\n", root_);
+            for (SizeType i = 0; i < stateCount_ ; i++) {
+                State& s = GetState(i);
+                printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
+            }
+            printf("\n");
+#endif
+        }
+    }
+
+    SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
+        State* s = states_.template Push<State>();
+        s->out = out;
+        s->out1 = out1;
+        s->codepoint = codepoint;
+        s->rangeStart = kRegexInvalidRange;
+        return stateCount_++;
+    }
+
+    void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
+        SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
+        *operandStack.template Push<Frag>() = Frag(s, s, s);
+    }
+
+    void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
+        if (*atomCountStack.template Top<unsigned>())
+            *operatorStack.template Push<Operator>() = kConcatenation;
+        (*atomCountStack.template Top<unsigned>())++;
+    }
+
+    SizeType Append(SizeType l1, SizeType l2) {
+        SizeType old = l1;
+        while (GetState(l1).out != kRegexInvalidState)
+            l1 = GetState(l1).out;
+        GetState(l1).out = l2;
+        return old;
+    }
+
+    void Patch(SizeType l, SizeType s) {
+        for (SizeType next; l != kRegexInvalidState; l = next) {
+            next = GetState(l).out;
+            GetState(l).out = s;
+        }
+    }
+
+    bool Eval(Stack<Allocator>& operandStack, Operator op) {
+        switch (op) {
+            case kConcatenation:
+                RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2);
+                {
+                    Frag e2 = *operandStack.template Pop<Frag>(1);
+                    Frag e1 = *operandStack.template Pop<Frag>(1);
+                    Patch(e1.out, e2.start);
+                    *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
+                }
+                return true;
+
+            case kAlternation:
+                if (operandStack.GetSize() >= sizeof(Frag) * 2) {
+                    Frag e2 = *operandStack.template Pop<Frag>(1);
+                    Frag e1 = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(e1.start, e2.start, 0);
+                    *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
+                    return true;
+                }
+                return false;
+
+            case kZeroOrOne:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
+                    return true;
+                }
+                return false;
+
+            case kZeroOrMore:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    Patch(e.out, s);
+                    *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
+                    return true;
+                }
+                return false;
+
+            case kOneOrMore:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    Patch(e.out, s);
+                    *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
+                    return true;
+                }
+                return false;
+
+            default: 
+                // syntax error (e.g. unclosed kLeftParenthesis)
+                return false;
+        }
+    }
+
+    bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
+        RAPIDJSON_ASSERT(n <= m);
+        RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag));
+
+        if (n == 0) {
+            if (m == 0)                             // a{0} not support
+                return false;
+            else if (m == kInfinityQuantifier)
+                Eval(operandStack, kZeroOrMore);    // a{0,} -> a*
+            else {
+                Eval(operandStack, kZeroOrOne);         // a{0,5} -> a?
+                for (unsigned i = 0; i < m - 1; i++)
+                    CloneTopOperand(operandStack);      // a{0,5} -> a? a? a? a? a?
+                for (unsigned i = 0; i < m - 1; i++)
+                    Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a?
+            }
+            return true;
+        }
+
+        for (unsigned i = 0; i < n - 1; i++)        // a{3} -> a a a
+            CloneTopOperand(operandStack);
+
+        if (m == kInfinityQuantifier)
+            Eval(operandStack, kOneOrMore);         // a{3,} -> a a a+
+        else if (m > n) {
+            CloneTopOperand(operandStack);          // a{3,5} -> a a a a
+            Eval(operandStack, kZeroOrOne);         // a{3,5} -> a a a a?
+            for (unsigned i = n; i < m - 1; i++)
+                CloneTopOperand(operandStack);      // a{3,5} -> a a a a? a?
+            for (unsigned i = n; i < m; i++)
+                Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
+        }
+
+        for (unsigned i = 0; i < n - 1; i++)
+            Eval(operandStack, kConcatenation);     // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a?
+
+        return true;
+    }
+
+    static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
+
+    void CloneTopOperand(Stack<Allocator>& operandStack) {
+        const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation
+        SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_)
+        State* s = states_.template Push<State>(count);
+        memcpy(s, &GetState(src.minIndex), count * sizeof(State));
+        for (SizeType j = 0; j < count; j++) {
+            if (s[j].out != kRegexInvalidState)
+                s[j].out += count;
+            if (s[j].out1 != kRegexInvalidState)
+                s[j].out1 += count;
+        }
+        *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
+        stateCount_ += count;
+    }
+
+    template <typename InputStream>
+    bool ParseUnsigned(DecodedStream<InputStream, Encoding>& ds, unsigned* u) {
+        unsigned r = 0;
+        if (ds.Peek() < '0' || ds.Peek() > '9')
+            return false;
+        while (ds.Peek() >= '0' && ds.Peek() <= '9') {
+            if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295
+                return false; // overflow
+            r = r * 10 + (ds.Take() - '0');
+        }
+        *u = r;
+        return true;
+    }
+
+    template <typename InputStream>
+    bool ParseRange(DecodedStream<InputStream, Encoding>& ds, SizeType* range) {
+        bool isBegin = true;
+        bool negate = false;
+        int step = 0;
+        SizeType start = kRegexInvalidRange;
+        SizeType current = kRegexInvalidRange;
+        unsigned codepoint;
+        while ((codepoint = ds.Take()) != 0) {
+            if (isBegin) {
+                isBegin = false;
+                if (codepoint == '^') {
+                    negate = true;
+                    continue;
+                }
+            }
+
+            switch (codepoint) {
+            case ']':
+                if (start == kRegexInvalidRange)
+                    return false;   // Error: nothing inside []
+                if (step == 2) { // Add trailing '-'
+                    SizeType r = NewRange('-');
+                    RAPIDJSON_ASSERT(current != kRegexInvalidRange);
+                    GetRange(current).next = r;
+                }
+                if (negate)
+                    GetRange(start).start |= kRangeNegationFlag;
+                *range = start;
+                return true;
+
+            case '\\':
+                if (ds.Peek() == 'b') {
+                    ds.Take();
+                    codepoint = 0x0008; // Escape backspace character
+                }
+                else if (!CharacterEscape(ds, &codepoint))
+                    return false;
+                // fall through to default
+                RAPIDJSON_DELIBERATE_FALLTHROUGH;
+
+            default:
+                switch (step) {
+                case 1:
+                    if (codepoint == '-') {
+                        step++;
+                        break;
+                    }
+                    // fall through to step 0 for other characters
+                    RAPIDJSON_DELIBERATE_FALLTHROUGH;
+
+                case 0:
+                    {
+                        SizeType r = NewRange(codepoint);
+                        if (current != kRegexInvalidRange)
+                            GetRange(current).next = r;
+                        if (start == kRegexInvalidRange)
+                            start = r;
+                        current = r;
+                    }
+                    step = 1;
+                    break;
+
+                default:
+                    RAPIDJSON_ASSERT(step == 2);
+                    GetRange(current).end = codepoint;
+                    step = 0;
+                }
+            }
+        }
+        return false;
+    }
+    
+    SizeType NewRange(unsigned codepoint) {
+        Range* r = ranges_.template Push<Range>();
+        r->start = r->end = codepoint;
+        r->next = kRegexInvalidRange;
+        return rangeCount_++;
+    }
+
+    template <typename InputStream>
+    bool CharacterEscape(DecodedStream<InputStream, Encoding>& ds, unsigned* escapedCodepoint) {
+        unsigned codepoint;
+        switch (codepoint = ds.Take()) {
+            case '^':
+            case '$':
+            case '|':
+            case '(':
+            case ')':
+            case '?':
+            case '*':
+            case '+':
+            case '.':
+            case '[':
+            case ']':
+            case '{':
+            case '}':
+            case '\\':
+                *escapedCodepoint = codepoint; return true;
+            case 'f': *escapedCodepoint = 0x000C; return true;
+            case 'n': *escapedCodepoint = 0x000A; return true;
+            case 'r': *escapedCodepoint = 0x000D; return true;
+            case 't': *escapedCodepoint = 0x0009; return true;
+            case 'v': *escapedCodepoint = 0x000B; return true;
+            default:
+                return false; // Unsupported escape character
+        }
+    }
+
+    Allocator* ownAllocator_;
+    Allocator* allocator_;
+    Stack<Allocator> states_;
+    Stack<Allocator> ranges_;
+    SizeType root_;
+    SizeType stateCount_;
+    SizeType rangeCount_;
+
+    static const unsigned kInfinityQuantifier = ~0u;
+
+    // For SearchWithAnchoring()
+    bool anchorBegin_;
+    bool anchorEnd_;
+};
+
+template <typename RegexType, typename Allocator = CrtAllocator>
+class GenericRegexSearch {
+public:
+    typedef typename RegexType::EncodingType Encoding;
+    typedef typename Encoding::Ch Ch;
+
+    GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) : 
+        regex_(regex), allocator_(allocator), ownAllocator_(0),
+        state0_(allocator, 0), state1_(allocator, 0), stateSet_()
+    {
+        RAPIDJSON_ASSERT(regex_.IsValid());
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+        stateSet_ = static_cast<uint32_t*>(allocator_->Malloc(GetStateSetSize()));
+        state0_.template Reserve<SizeType>(regex_.stateCount_);
+        state1_.template Reserve<SizeType>(regex_.stateCount_);
+    }
+
+    ~GenericRegexSearch() {
+        Allocator::Free(stateSet_);
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    template <typename InputStream>
+    bool Match(InputStream& is) {
+        return SearchWithAnchoring(is, true, true);
+    }
+
+    bool Match(const Ch* s) {
+        GenericStringStream<Encoding> is(s);
+        return Match(is);
+    }
+
+    template <typename InputStream>
+    bool Search(InputStream& is) {
+        return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
+    }
+
+    bool Search(const Ch* s) {
+        GenericStringStream<Encoding> is(s);
+        return Search(is);
+    }
+
+private:
+    typedef typename RegexType::State State;
+    typedef typename RegexType::Range Range;
+
+    template <typename InputStream>
+    bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) {
+        DecodedStream<InputStream, Encoding> ds(is);
+
+        state0_.Clear();
+        Stack<Allocator> *current = &state0_, *next = &state1_;
+        const size_t stateSetSize = GetStateSetSize();
+        std::memset(stateSet_, 0, stateSetSize);
+
+        bool matched = AddState(*current, regex_.root_);
+        unsigned codepoint;
+        while (!current->Empty() && (codepoint = ds.Take()) != 0) {
+            std::memset(stateSet_, 0, stateSetSize);
+            next->Clear();
+            matched = false;
+            for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
+                const State& sr = regex_.GetState(*s);
+                if (sr.codepoint == codepoint ||
+                    sr.codepoint == RegexType::kAnyCharacterClass || 
+                    (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
+                {
+                    matched = AddState(*next, sr.out) || matched;
+                    if (!anchorEnd && matched)
+                        return true;
+                }
+                if (!anchorBegin)
+                    AddState(*next, regex_.root_);
+            }
+            internal::Swap(current, next);
+        }
+
+        return matched;
+    }
+
+    size_t GetStateSetSize() const {
+        return (regex_.stateCount_ + 31) / 32 * 4;
+    }
+
+    // Return whether the added states is a match state
+    bool AddState(Stack<Allocator>& l, SizeType index) {
+        RAPIDJSON_ASSERT(index != kRegexInvalidState);
+
+        const State& s = regex_.GetState(index);
+        if (s.out1 != kRegexInvalidState) { // Split
+            bool matched = AddState(l, s.out);
+            return AddState(l, s.out1) || matched;
+        }
+        else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) {
+            stateSet_[index >> 5] |= (1u << (index & 31));
+            *l.template PushUnsafe<SizeType>() = index;
+        }
+        return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation.
+    }
+
+    bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
+        bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0;
+        while (rangeIndex != kRegexInvalidRange) {
+            const Range& r = regex_.GetRange(rangeIndex);
+            if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end)
+                return yes;
+            rangeIndex = r.next;
+        }
+        return !yes;
+    }
+
+    const RegexType& regex_;
+    Allocator* allocator_;
+    Allocator* ownAllocator_;
+    Stack<Allocator> state0_;
+    Stack<Allocator> state1_;
+    uint32_t* stateSet_;
+};
+
+typedef GenericRegex<UTF8<> > Regex;
+typedef GenericRegexSearch<Regex> RegexSearch;
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#if defined(__clang__) || defined(_MSC_VER)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_INTERNAL_REGEX_H_
diff --git a/include/rapidjson/internal/stack.h b/include/rapidjson/internal/stack.h
new file mode 100644
index 0000000000..73abd706e9
--- /dev/null
+++ b/include/rapidjson/internal/stack.h
@@ -0,0 +1,232 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_STACK_H_
+#define RAPIDJSON_INTERNAL_STACK_H_
+
+#include "../allocators.h"
+#include "swap.h"
+#include <cstddef>
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////////////
+// Stack
+
+//! A type-unsafe stack for storing different types of data.
+/*! \tparam Allocator Allocator for allocating stack memory.
+*/
+template <typename Allocator>
+class Stack {
+public:
+    // Optimization note: Do not allocate memory for stack_ in constructor.
+    // Do it lazily when first Push() -> Expand() -> Resize().
+    Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) {
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    Stack(Stack&& rhs)
+        : allocator_(rhs.allocator_),
+          ownAllocator_(rhs.ownAllocator_),
+          stack_(rhs.stack_),
+          stackTop_(rhs.stackTop_),
+          stackEnd_(rhs.stackEnd_),
+          initialCapacity_(rhs.initialCapacity_)
+    {
+        rhs.allocator_ = 0;
+        rhs.ownAllocator_ = 0;
+        rhs.stack_ = 0;
+        rhs.stackTop_ = 0;
+        rhs.stackEnd_ = 0;
+        rhs.initialCapacity_ = 0;
+    }
+#endif
+
+    ~Stack() {
+        Destroy();
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    Stack& operator=(Stack&& rhs) {
+        if (&rhs != this)
+        {
+            Destroy();
+
+            allocator_ = rhs.allocator_;
+            ownAllocator_ = rhs.ownAllocator_;
+            stack_ = rhs.stack_;
+            stackTop_ = rhs.stackTop_;
+            stackEnd_ = rhs.stackEnd_;
+            initialCapacity_ = rhs.initialCapacity_;
+
+            rhs.allocator_ = 0;
+            rhs.ownAllocator_ = 0;
+            rhs.stack_ = 0;
+            rhs.stackTop_ = 0;
+            rhs.stackEnd_ = 0;
+            rhs.initialCapacity_ = 0;
+        }
+        return *this;
+    }
+#endif
+
+    void Swap(Stack& rhs) RAPIDJSON_NOEXCEPT {
+        internal::Swap(allocator_, rhs.allocator_);
+        internal::Swap(ownAllocator_, rhs.ownAllocator_);
+        internal::Swap(stack_, rhs.stack_);
+        internal::Swap(stackTop_, rhs.stackTop_);
+        internal::Swap(stackEnd_, rhs.stackEnd_);
+        internal::Swap(initialCapacity_, rhs.initialCapacity_);
+    }
+
+    void Clear() { stackTop_ = stack_; }
+
+    void ShrinkToFit() { 
+        if (Empty()) {
+            // If the stack is empty, completely deallocate the memory.
+            Allocator::Free(stack_); // NOLINT (+clang-analyzer-unix.Malloc)
+            stack_ = 0;
+            stackTop_ = 0;
+            stackEnd_ = 0;
+        }
+        else
+            Resize(GetSize());
+    }
+
+    // Optimization note: try to minimize the size of this function for force inline.
+    // Expansion is run very infrequently, so it is moved to another (probably non-inline) function.
+    template<typename T>
+    RAPIDJSON_FORCEINLINE void Reserve(size_t count = 1) {
+         // Expand the stack if needed
+        if (RAPIDJSON_UNLIKELY(static_cast<std::ptrdiff_t>(sizeof(T) * count) > (stackEnd_ - stackTop_)))
+            Expand<T>(count);
+    }
+
+    template<typename T>
+    RAPIDJSON_FORCEINLINE T* Push(size_t count = 1) {
+        Reserve<T>(count);
+        return PushUnsafe<T>(count);
+    }
+
+    template<typename T>
+    RAPIDJSON_FORCEINLINE T* PushUnsafe(size_t count = 1) {
+        RAPIDJSON_ASSERT(stackTop_);
+        RAPIDJSON_ASSERT(static_cast<std::ptrdiff_t>(sizeof(T) * count) <= (stackEnd_ - stackTop_));
+        T* ret = reinterpret_cast<T*>(stackTop_);
+        stackTop_ += sizeof(T) * count;
+        return ret;
+    }
+
+    template<typename T>
+    T* Pop(size_t count) {
+        RAPIDJSON_ASSERT(GetSize() >= count * sizeof(T));
+        stackTop_ -= count * sizeof(T);
+        return reinterpret_cast<T*>(stackTop_);
+    }
+
+    template<typename T>
+    T* Top() { 
+        RAPIDJSON_ASSERT(GetSize() >= sizeof(T));
+        return reinterpret_cast<T*>(stackTop_ - sizeof(T));
+    }
+
+    template<typename T>
+    const T* Top() const {
+        RAPIDJSON_ASSERT(GetSize() >= sizeof(T));
+        return reinterpret_cast<T*>(stackTop_ - sizeof(T));
+    }
+
+    template<typename T>
+    T* End() { return reinterpret_cast<T*>(stackTop_); }
+
+    template<typename T>
+    const T* End() const { return reinterpret_cast<T*>(stackTop_); }
+
+    template<typename T>
+    T* Bottom() { return reinterpret_cast<T*>(stack_); }
+
+    template<typename T>
+    const T* Bottom() const { return reinterpret_cast<T*>(stack_); }
+
+    bool HasAllocator() const {
+        return allocator_ != 0;
+    }
+
+    Allocator& GetAllocator() {
+        RAPIDJSON_ASSERT(allocator_);
+        return *allocator_;
+    }
+
+    bool Empty() const { return stackTop_ == stack_; }
+    size_t GetSize() const { return static_cast<size_t>(stackTop_ - stack_); }
+    size_t GetCapacity() const { return static_cast<size_t>(stackEnd_ - stack_); }
+
+private:
+    template<typename T>
+    void Expand(size_t count) {
+        // Only expand the capacity if the current stack exists. Otherwise just create a stack with initial capacity.
+        size_t newCapacity;
+        if (stack_ == 0) {
+            if (!allocator_)
+                ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+            newCapacity = initialCapacity_;
+        } else {
+            newCapacity = GetCapacity();
+            newCapacity += (newCapacity + 1) / 2;
+        }
+        size_t newSize = GetSize() + sizeof(T) * count;
+        if (newCapacity < newSize)
+            newCapacity = newSize;
+
+        Resize(newCapacity);
+    }
+
+    void Resize(size_t newCapacity) {
+        const size_t size = GetSize();  // Backup the current size
+        stack_ = static_cast<char*>(allocator_->Realloc(stack_, GetCapacity(), newCapacity));
+        stackTop_ = stack_ + size;
+        stackEnd_ = stack_ + newCapacity;
+    }
+
+    void Destroy() {
+        Allocator::Free(stack_);
+        RAPIDJSON_DELETE(ownAllocator_); // Only delete if it is owned by the stack
+    }
+
+    // Prohibit copy constructor & assignment operator.
+    Stack(const Stack&);
+    Stack& operator=(const Stack&);
+
+    Allocator* allocator_;
+    Allocator* ownAllocator_;
+    char *stack_;
+    char *stackTop_;
+    char *stackEnd_;
+    size_t initialCapacity_;
+};
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_STACK_H_
diff --git a/include/rapidjson/internal/strfunc.h b/include/rapidjson/internal/strfunc.h
new file mode 100644
index 0000000000..b698a8f43f
--- /dev/null
+++ b/include/rapidjson/internal/strfunc.h
@@ -0,0 +1,83 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_STRFUNC_H_
+#define RAPIDJSON_INTERNAL_STRFUNC_H_
+
+#include "../stream.h"
+#include <cwchar>
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+//! Custom strlen() which works on different character types.
+/*! \tparam Ch Character type (e.g. char, wchar_t, short)
+    \param s Null-terminated input string.
+    \return Number of characters in the string. 
+    \note This has the same semantics as strlen(), the return value is not number of Unicode codepoints.
+*/
+template <typename Ch>
+inline SizeType StrLen(const Ch* s) {
+    RAPIDJSON_ASSERT(s != 0);
+    const Ch* p = s;
+    while (*p) ++p;
+    return SizeType(p - s);
+}
+
+template <>
+inline SizeType StrLen(const char* s) {
+    return SizeType(std::strlen(s));
+}
+
+template <>
+inline SizeType StrLen(const wchar_t* s) {
+    return SizeType(std::wcslen(s));
+}
+
+//! Custom strcmpn() which works on different character types.
+/*! \tparam Ch Character type (e.g. char, wchar_t, short)
+    \param s1 Null-terminated input string.
+    \param s2 Null-terminated input string.
+    \return 0 if equal
+*/
+template<typename Ch>
+inline int StrCmp(const Ch* s1, const Ch* s2) {
+    RAPIDJSON_ASSERT(s1 != 0);
+    RAPIDJSON_ASSERT(s2 != 0);
+    while(*s1 && (*s1 == *s2)) { s1++; s2++; }
+    return static_cast<unsigned>(*s1) < static_cast<unsigned>(*s2) ? -1 : static_cast<unsigned>(*s1) > static_cast<unsigned>(*s2);
+}
+
+//! Returns number of code points in a encoded string.
+template<typename Encoding>
+bool CountStringCodePoint(const typename Encoding::Ch* s, SizeType length, SizeType* outCount) {
+    RAPIDJSON_ASSERT(s != 0);
+    RAPIDJSON_ASSERT(outCount != 0);
+    GenericStringStream<Encoding> is(s);
+    const typename Encoding::Ch* end = s + length;
+    SizeType count = 0;
+    while (is.src_ < end) {
+        unsigned codepoint;
+        if (!Encoding::Decode(is, &codepoint))
+            return false;
+        count++;
+    }
+    *outCount = count;
+    return true;
+}
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_INTERNAL_STRFUNC_H_
diff --git a/include/rapidjson/internal/strtod.h b/include/rapidjson/internal/strtod.h
new file mode 100644
index 0000000000..57c8418bd9
--- /dev/null
+++ b/include/rapidjson/internal/strtod.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_STRTOD_
+#define RAPIDJSON_STRTOD_
+
+#include "ieee754.h"
+#include "biginteger.h"
+#include "diyfp.h"
+#include "pow10.h"
+#include <climits>
+#include <limits>
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+inline double FastPath(double significand, int exp) {
+    if (exp < -308)
+        return 0.0;
+    else if (exp >= 0)
+        return significand * internal::Pow10(exp);
+    else
+        return significand / internal::Pow10(-exp);
+}
+
+inline double StrtodNormalPrecision(double d, int p) {
+    if (p < -308) {
+        // Prevent expSum < -308, making Pow10(p) = 0
+        d = FastPath(d, -308);
+        d = FastPath(d, p + 308);
+    }
+    else
+        d = FastPath(d, p);
+    return d;
+}
+
+template <typename T>
+inline T Min3(T a, T b, T c) {
+    T m = a;
+    if (m > b) m = b;
+    if (m > c) m = c;
+    return m;
+}
+
+inline int CheckWithinHalfULP(double b, const BigInteger& d, int dExp) {
+    const Double db(b);
+    const uint64_t bInt = db.IntegerSignificand();
+    const int bExp = db.IntegerExponent();
+    const int hExp = bExp - 1;
+
+    int dS_Exp2 = 0, dS_Exp5 = 0, bS_Exp2 = 0, bS_Exp5 = 0, hS_Exp2 = 0, hS_Exp5 = 0;
+
+    // Adjust for decimal exponent
+    if (dExp >= 0) {
+        dS_Exp2 += dExp;
+        dS_Exp5 += dExp;
+    }
+    else {
+        bS_Exp2 -= dExp;
+        bS_Exp5 -= dExp;
+        hS_Exp2 -= dExp;
+        hS_Exp5 -= dExp;
+    }
+
+    // Adjust for binary exponent
+    if (bExp >= 0)
+        bS_Exp2 += bExp;
+    else {
+        dS_Exp2 -= bExp;
+        hS_Exp2 -= bExp;
+    }
+
+    // Adjust for half ulp exponent
+    if (hExp >= 0)
+        hS_Exp2 += hExp;
+    else {
+        dS_Exp2 -= hExp;
+        bS_Exp2 -= hExp;
+    }
+
+    // Remove common power of two factor from all three scaled values
+    int common_Exp2 = Min3(dS_Exp2, bS_Exp2, hS_Exp2);
+    dS_Exp2 -= common_Exp2;
+    bS_Exp2 -= common_Exp2;
+    hS_Exp2 -= common_Exp2;
+
+    BigInteger dS = d;
+    dS.MultiplyPow5(static_cast<unsigned>(dS_Exp5)) <<= static_cast<unsigned>(dS_Exp2);
+
+    BigInteger bS(bInt);
+    bS.MultiplyPow5(static_cast<unsigned>(bS_Exp5)) <<= static_cast<unsigned>(bS_Exp2);
+
+    BigInteger hS(1);
+    hS.MultiplyPow5(static_cast<unsigned>(hS_Exp5)) <<= static_cast<unsigned>(hS_Exp2);
+
+    BigInteger delta(0);
+    dS.Difference(bS, &delta);
+
+    return delta.Compare(hS);
+}
+
+inline bool StrtodFast(double d, int p, double* result) {
+    // Use fast path for string-to-double conversion if possible
+    // see http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
+    if (p > 22  && p < 22 + 16) {
+        // Fast Path Cases In Disguise
+        d *= internal::Pow10(p - 22);
+        p = 22;
+    }
+
+    if (p >= -22 && p <= 22 && d <= 9007199254740991.0) { // 2^53 - 1
+        *result = FastPath(d, p);
+        return true;
+    }
+    else
+        return false;
+}
+
+// Compute an approximation and see if it is within 1/2 ULP
+template<typename Ch>
+inline bool StrtodDiyFp(const Ch* decimals, int dLen, int dExp, double* result) {
+    uint64_t significand = 0;
+    int i = 0;   // 2^64 - 1 = 18446744073709551615, 1844674407370955161 = 0x1999999999999999    
+    for (; i < dLen; i++) {
+        if (significand  >  RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) ||
+            (significand == RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) && decimals[i] >= Ch('5')))
+            break;
+        significand = significand * 10u + static_cast<unsigned>(decimals[i] - Ch('0'));
+    }
+    
+    if (i < dLen && decimals[i] >= Ch('5')) // Rounding
+        significand++;
+
+    int remaining = dLen - i;
+    const int kUlpShift = 3;
+    const int kUlp = 1 << kUlpShift;
+    int64_t error = (remaining == 0) ? 0 : kUlp / 2;
+
+    DiyFp v(significand, 0);
+    v = v.Normalize();
+    error <<= -v.e;
+
+    dExp += remaining;
+
+    int actualExp;
+    DiyFp cachedPower = GetCachedPower10(dExp, &actualExp);
+    if (actualExp != dExp) {
+        static const DiyFp kPow10[] = {
+            DiyFp(RAPIDJSON_UINT64_C2(0xa0000000, 0x00000000), -60),  // 10^1
+            DiyFp(RAPIDJSON_UINT64_C2(0xc8000000, 0x00000000), -57),  // 10^2
+            DiyFp(RAPIDJSON_UINT64_C2(0xfa000000, 0x00000000), -54),  // 10^3
+            DiyFp(RAPIDJSON_UINT64_C2(0x9c400000, 0x00000000), -50),  // 10^4
+            DiyFp(RAPIDJSON_UINT64_C2(0xc3500000, 0x00000000), -47),  // 10^5
+            DiyFp(RAPIDJSON_UINT64_C2(0xf4240000, 0x00000000), -44),  // 10^6
+            DiyFp(RAPIDJSON_UINT64_C2(0x98968000, 0x00000000), -40)   // 10^7
+        };
+        int adjustment = dExp - actualExp;
+        RAPIDJSON_ASSERT(adjustment >= 1 && adjustment < 8);
+        v = v * kPow10[adjustment - 1];
+        if (dLen + adjustment > 19) // has more digits than decimal digits in 64-bit
+            error += kUlp / 2;
+    }
+
+    v = v * cachedPower;
+
+    error += kUlp + (error == 0 ? 0 : 1);
+
+    const int oldExp = v.e;
+    v = v.Normalize();
+    error <<= oldExp - v.e;
+
+    const int effectiveSignificandSize = Double::EffectiveSignificandSize(64 + v.e);
+    int precisionSize = 64 - effectiveSignificandSize;
+    if (precisionSize + kUlpShift >= 64) {
+        int scaleExp = (precisionSize + kUlpShift) - 63;
+        v.f >>= scaleExp;
+        v.e += scaleExp; 
+        error = (error >> scaleExp) + 1 + kUlp;
+        precisionSize -= scaleExp;
+    }
+
+    DiyFp rounded(v.f >> precisionSize, v.e + precisionSize);
+    const uint64_t precisionBits = (v.f & ((uint64_t(1) << precisionSize) - 1)) * kUlp;
+    const uint64_t halfWay = (uint64_t(1) << (precisionSize - 1)) * kUlp;
+    if (precisionBits >= halfWay + static_cast<unsigned>(error)) {
+        rounded.f++;
+        if (rounded.f & (DiyFp::kDpHiddenBit << 1)) { // rounding overflows mantissa (issue #340)
+            rounded.f >>= 1;
+            rounded.e++;
+        }
+    }
+
+    *result = rounded.ToDouble();
+
+    return halfWay - static_cast<unsigned>(error) >= precisionBits || precisionBits >= halfWay + static_cast<unsigned>(error);
+}
+
+template<typename Ch>
+inline double StrtodBigInteger(double approx, const Ch* decimals, int dLen, int dExp) {
+    RAPIDJSON_ASSERT(dLen >= 0);
+    const BigInteger dInt(decimals, static_cast<unsigned>(dLen));
+    Double a(approx);
+    int cmp = CheckWithinHalfULP(a.Value(), dInt, dExp);
+    if (cmp < 0)
+        return a.Value();  // within half ULP
+    else if (cmp == 0) {
+        // Round towards even
+        if (a.Significand() & 1)
+            return a.NextPositiveDouble();
+        else
+            return a.Value();
+    }
+    else // adjustment
+        return a.NextPositiveDouble();
+}
+
+template<typename Ch>
+inline double StrtodFullPrecision(double d, int p, const Ch* decimals, size_t length, size_t decimalPosition, int exp) {
+    RAPIDJSON_ASSERT(d >= 0.0);
+    RAPIDJSON_ASSERT(length >= 1);
+
+    double result = 0.0;
+    if (StrtodFast(d, p, &result))
+        return result;
+
+    RAPIDJSON_ASSERT(length <= INT_MAX);
+    int dLen = static_cast<int>(length);
+
+    RAPIDJSON_ASSERT(length >= decimalPosition);
+    RAPIDJSON_ASSERT(length - decimalPosition <= INT_MAX);
+    int dExpAdjust = static_cast<int>(length - decimalPosition);
+
+    RAPIDJSON_ASSERT(exp >= INT_MIN + dExpAdjust);
+    int dExp = exp - dExpAdjust;
+
+    // Make sure length+dExp does not overflow
+    RAPIDJSON_ASSERT(dExp <= INT_MAX - dLen);
+
+    // Trim leading zeros
+    while (dLen > 0 && *decimals == '0') {
+        dLen--;
+        decimals++;
+    }
+
+    // Trim trailing zeros
+    while (dLen > 0 && decimals[dLen - 1] == '0') {
+        dLen--;
+        dExp++;
+    }
+
+    if (dLen == 0) { // Buffer only contains zeros.
+        return 0.0;
+    }
+
+    // Trim right-most digits
+    const int kMaxDecimalDigit = 767 + 1;
+    if (dLen > kMaxDecimalDigit) {
+        dExp += dLen - kMaxDecimalDigit;
+        dLen = kMaxDecimalDigit;
+    }
+
+    // If too small, underflow to zero.
+    // Any x <= 10^-324 is interpreted as zero.
+    if (dLen + dExp <= -324)
+        return 0.0;
+
+    // If too large, overflow to infinity.
+    // Any x >= 10^309 is interpreted as +infinity.
+    if (dLen + dExp > 309)
+        return std::numeric_limits<double>::infinity();
+
+    if (StrtodDiyFp(decimals, dLen, dExp, &result))
+        return result;
+
+    // Use approximation from StrtodDiyFp and make adjustment with BigInteger comparison
+    return StrtodBigInteger(result, decimals, dLen, dExp);
+}
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_STRTOD_
diff --git a/include/rapidjson/internal/swap.h b/include/rapidjson/internal/swap.h
new file mode 100644
index 0000000000..2cf92f93a1
--- /dev/null
+++ b/include/rapidjson/internal/swap.h
@@ -0,0 +1,46 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_SWAP_H_
+#define RAPIDJSON_INTERNAL_SWAP_H_
+
+#include "../rapidjson.h"
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+//! Custom swap() to avoid dependency on C++ <algorithm> header
+/*! \tparam T Type of the arguments to swap, should be instantiated with primitive C++ types only.
+    \note This has the same semantics as std::swap().
+*/
+template <typename T>
+inline void Swap(T& a, T& b) RAPIDJSON_NOEXCEPT {
+    T tmp = a;
+        a = b;
+        b = tmp;
+}
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_INTERNAL_SWAP_H_
diff --git a/include/rapidjson/istreamwrapper.h b/include/rapidjson/istreamwrapper.h
new file mode 100644
index 0000000000..01437ec012
--- /dev/null
+++ b/include/rapidjson/istreamwrapper.h
@@ -0,0 +1,128 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ISTREAMWRAPPER_H_
+#define RAPIDJSON_ISTREAMWRAPPER_H_
+
+#include "stream.h"
+#include <iosfwd>
+#include <ios>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4351) // new behavior: elements of array 'array' will be default initialized
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Wrapper of \c std::basic_istream into RapidJSON's Stream concept.
+/*!
+    The classes can be wrapped including but not limited to:
+
+    - \c std::istringstream
+    - \c std::stringstream
+    - \c std::wistringstream
+    - \c std::wstringstream
+    - \c std::ifstream
+    - \c std::fstream
+    - \c std::wifstream
+    - \c std::wfstream
+
+    \tparam StreamType Class derived from \c std::basic_istream.
+*/
+   
+template <typename StreamType>
+class BasicIStreamWrapper {
+public:
+    typedef typename StreamType::char_type Ch;
+
+    //! Constructor.
+    /*!
+        \param stream stream opened for read.
+    */
+    BasicIStreamWrapper(StreamType &stream) : stream_(stream), buffer_(peekBuffer_), bufferSize_(4), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { 
+        Read();
+    }
+
+    //! Constructor.
+    /*!
+        \param stream stream opened for read.
+        \param buffer user-supplied buffer.
+        \param bufferSize size of buffer in bytes. Must >=4 bytes.
+    */
+    BasicIStreamWrapper(StreamType &stream, char* buffer, size_t bufferSize) : stream_(stream), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { 
+        RAPIDJSON_ASSERT(bufferSize >= 4);
+        Read();
+    }
+
+    Ch Peek() const { return *current_; }
+    Ch Take() { Ch c = *current_; Read(); return c; }
+    size_t Tell() const { return count_ + static_cast<size_t>(current_ - buffer_); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); } 
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    // For encoding detection only.
+    const Ch* Peek4() const {
+        return (current_ + 4 - !eof_ <= bufferLast_) ? current_ : 0;
+    }
+
+private:
+    BasicIStreamWrapper();
+    BasicIStreamWrapper(const BasicIStreamWrapper&);
+    BasicIStreamWrapper& operator=(const BasicIStreamWrapper&);
+
+    void Read() {
+        if (current_ < bufferLast_)
+            ++current_;
+        else if (!eof_) {
+            count_ += readCount_;
+            readCount_ = bufferSize_;
+            bufferLast_ = buffer_ + readCount_ - 1;
+            current_ = buffer_;
+
+            if (!stream_.read(buffer_, static_cast<std::streamsize>(bufferSize_))) {
+                readCount_ = static_cast<size_t>(stream_.gcount());
+                *(bufferLast_ = buffer_ + readCount_) = '\0';
+                eof_ = true;
+            }
+        }
+    }
+
+    StreamType &stream_;
+    Ch peekBuffer_[4], *buffer_;
+    size_t bufferSize_;
+    Ch *bufferLast_;
+    Ch *current_;
+    size_t readCount_;
+    size_t count_;  //!< Number of characters read
+    bool eof_;
+};
+
+typedef BasicIStreamWrapper<std::istream> IStreamWrapper;
+typedef BasicIStreamWrapper<std::wistream> WIStreamWrapper;
+
+#if defined(__clang__) || defined(_MSC_VER)
+RAPIDJSON_DIAG_POP
+#endif
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_ISTREAMWRAPPER_H_
diff --git a/include/rapidjson/memorybuffer.h b/include/rapidjson/memorybuffer.h
new file mode 100644
index 0000000000..ffbc41ed1f
--- /dev/null
+++ b/include/rapidjson/memorybuffer.h
@@ -0,0 +1,70 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_MEMORYBUFFER_H_
+#define RAPIDJSON_MEMORYBUFFER_H_
+
+#include "stream.h"
+#include "internal/stack.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory output byte stream.
+/*!
+    This class is mainly for being wrapped by EncodedOutputStream or AutoUTFOutputStream.
+
+    It is similar to FileWriteBuffer but the destination is an in-memory buffer instead of a file.
+
+    Differences between MemoryBuffer and StringBuffer:
+    1. StringBuffer has Encoding but MemoryBuffer is only a byte buffer. 
+    2. StringBuffer::GetString() returns a null-terminated string. MemoryBuffer::GetBuffer() returns a buffer without terminator.
+
+    \tparam Allocator type for allocating memory buffer.
+    \note implements Stream concept
+*/
+template <typename Allocator = CrtAllocator>
+struct GenericMemoryBuffer {
+    typedef char Ch; // byte
+
+    GenericMemoryBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {}
+
+    void Put(Ch c) { *stack_.template Push<Ch>() = c; }
+    void Flush() {}
+
+    void Clear() { stack_.Clear(); }
+    void ShrinkToFit() { stack_.ShrinkToFit(); }
+    Ch* Push(size_t count) { return stack_.template Push<Ch>(count); }
+    void Pop(size_t count) { stack_.template Pop<Ch>(count); }
+
+    const Ch* GetBuffer() const {
+        return stack_.template Bottom<Ch>();
+    }
+
+    size_t GetSize() const { return stack_.GetSize(); }
+
+    static const size_t kDefaultCapacity = 256;
+    mutable internal::Stack<Allocator> stack_;
+};
+
+typedef GenericMemoryBuffer<> MemoryBuffer;
+
+//! Implement specialized version of PutN() with memset() for better performance.
+template<>
+inline void PutN(MemoryBuffer& memoryBuffer, char c, size_t n) {
+    std::memset(memoryBuffer.stack_.Push<char>(n), c, n * sizeof(c));
+}
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_MEMORYBUFFER_H_
diff --git a/include/rapidjson/memorystream.h b/include/rapidjson/memorystream.h
new file mode 100644
index 0000000000..77af6c999e
--- /dev/null
+++ b/include/rapidjson/memorystream.h
@@ -0,0 +1,71 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_MEMORYSTREAM_H_
+#define RAPIDJSON_MEMORYSTREAM_H_
+
+#include "stream.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(unreachable-code)
+RAPIDJSON_DIAG_OFF(missing-noreturn)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory input byte stream.
+/*!
+    This class is mainly for being wrapped by EncodedInputStream or AutoUTFInputStream.
+
+    It is similar to FileReadBuffer but the source is an in-memory buffer instead of a file.
+
+    Differences between MemoryStream and StringStream:
+    1. StringStream has encoding but MemoryStream is a byte stream.
+    2. MemoryStream needs size of the source buffer and the buffer don't need to be null terminated. StringStream assume null-terminated string as source.
+    3. MemoryStream supports Peek4() for encoding detection. StringStream is specified with an encoding so it should not have Peek4().
+    \note implements Stream concept
+*/
+struct MemoryStream {
+    typedef char Ch; // byte
+
+    MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {}
+
+    Ch Peek() const { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_; }
+    Ch Take() { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_++; }
+    size_t Tell() const { return static_cast<size_t>(src_ - begin_); }
+
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    // For encoding detection only.
+    const Ch* Peek4() const {
+        return Tell() + 4 <= size_ ? src_ : 0;
+    }
+
+    const Ch* src_;     //!< Current read position.
+    const Ch* begin_;   //!< Original head of the string.
+    const Ch* end_;     //!< End of stream.
+    size_t size_;       //!< Size of the stream.
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_MEMORYBUFFER_H_
diff --git a/include/rapidjson/msinttypes/inttypes.h b/include/rapidjson/msinttypes/inttypes.h
new file mode 100644
index 0000000000..18111286bf
--- /dev/null
+++ b/include/rapidjson/msinttypes/inttypes.h
@@ -0,0 +1,316 @@
+// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
+// 
+//  Copyright (c) 2006-2013 Alexander Chemeris
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+// 
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+// 
+//   3. Neither the name of the product nor the names of its contributors may
+//      be used to endorse or promote products derived from this software
+//      without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+///////////////////////////////////////////////////////////////////////////////
+
+// The above software in this distribution may have been modified by 
+// THL A29 Limited ("Tencent Modifications"). 
+// All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include "stdint.h"
+
+// miloyip: VC supports inttypes.h since VC2013
+#if _MSC_VER >= 1800
+#include <inttypes.h>
+#else
+
+// 7.8 Format conversion of integer types
+
+typedef struct {
+   intmax_t quot;
+   intmax_t rem;
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198
+
+// The fprintf macros for signed integers are:
+#define PRId8       "d"
+#define PRIi8       "i"
+#define PRIdLEAST8  "d"
+#define PRIiLEAST8  "i"
+#define PRIdFAST8   "d"
+#define PRIiFAST8   "i"
+
+#define PRId16       "hd"
+#define PRIi16       "hi"
+#define PRIdLEAST16  "hd"
+#define PRIiLEAST16  "hi"
+#define PRIdFAST16   "hd"
+#define PRIiFAST16   "hi"
+
+#define PRId32       "I32d"
+#define PRIi32       "I32i"
+#define PRIdLEAST32  "I32d"
+#define PRIiLEAST32  "I32i"
+#define PRIdFAST32   "I32d"
+#define PRIiFAST32   "I32i"
+
+#define PRId64       "I64d"
+#define PRIi64       "I64i"
+#define PRIdLEAST64  "I64d"
+#define PRIiLEAST64  "I64i"
+#define PRIdFAST64   "I64d"
+#define PRIiFAST64   "I64i"
+
+#define PRIdMAX     "I64d"
+#define PRIiMAX     "I64i"
+
+#define PRIdPTR     "Id"
+#define PRIiPTR     "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8       "o"
+#define PRIu8       "u"
+#define PRIx8       "x"
+#define PRIX8       "X"
+#define PRIoLEAST8  "o"
+#define PRIuLEAST8  "u"
+#define PRIxLEAST8  "x"
+#define PRIXLEAST8  "X"
+#define PRIoFAST8   "o"
+#define PRIuFAST8   "u"
+#define PRIxFAST8   "x"
+#define PRIXFAST8   "X"
+
+#define PRIo16       "ho"
+#define PRIu16       "hu"
+#define PRIx16       "hx"
+#define PRIX16       "hX"
+#define PRIoLEAST16  "ho"
+#define PRIuLEAST16  "hu"
+#define PRIxLEAST16  "hx"
+#define PRIXLEAST16  "hX"
+#define PRIoFAST16   "ho"
+#define PRIuFAST16   "hu"
+#define PRIxFAST16   "hx"
+#define PRIXFAST16   "hX"
+
+#define PRIo32       "I32o"
+#define PRIu32       "I32u"
+#define PRIx32       "I32x"
+#define PRIX32       "I32X"
+#define PRIoLEAST32  "I32o"
+#define PRIuLEAST32  "I32u"
+#define PRIxLEAST32  "I32x"
+#define PRIXLEAST32  "I32X"
+#define PRIoFAST32   "I32o"
+#define PRIuFAST32   "I32u"
+#define PRIxFAST32   "I32x"
+#define PRIXFAST32   "I32X"
+
+#define PRIo64       "I64o"
+#define PRIu64       "I64u"
+#define PRIx64       "I64x"
+#define PRIX64       "I64X"
+#define PRIoLEAST64  "I64o"
+#define PRIuLEAST64  "I64u"
+#define PRIxLEAST64  "I64x"
+#define PRIXLEAST64  "I64X"
+#define PRIoFAST64   "I64o"
+#define PRIuFAST64   "I64u"
+#define PRIxFAST64   "I64x"
+#define PRIXFAST64   "I64X"
+
+#define PRIoMAX     "I64o"
+#define PRIuMAX     "I64u"
+#define PRIxMAX     "I64x"
+#define PRIXMAX     "I64X"
+
+#define PRIoPTR     "Io"
+#define PRIuPTR     "Iu"
+#define PRIxPTR     "Ix"
+#define PRIXPTR     "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8       "d"
+#define SCNi8       "i"
+#define SCNdLEAST8  "d"
+#define SCNiLEAST8  "i"
+#define SCNdFAST8   "d"
+#define SCNiFAST8   "i"
+
+#define SCNd16       "hd"
+#define SCNi16       "hi"
+#define SCNdLEAST16  "hd"
+#define SCNiLEAST16  "hi"
+#define SCNdFAST16   "hd"
+#define SCNiFAST16   "hi"
+
+#define SCNd32       "ld"
+#define SCNi32       "li"
+#define SCNdLEAST32  "ld"
+#define SCNiLEAST32  "li"
+#define SCNdFAST32   "ld"
+#define SCNiFAST32   "li"
+
+#define SCNd64       "I64d"
+#define SCNi64       "I64i"
+#define SCNdLEAST64  "I64d"
+#define SCNiLEAST64  "I64i"
+#define SCNdFAST64   "I64d"
+#define SCNiFAST64   "I64i"
+
+#define SCNdMAX     "I64d"
+#define SCNiMAX     "I64i"
+
+#ifdef _WIN64 // [
+#  define SCNdPTR     "I64d"
+#  define SCNiPTR     "I64i"
+#else  // _WIN64 ][
+#  define SCNdPTR     "ld"
+#  define SCNiPTR     "li"
+#endif  // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8       "o"
+#define SCNu8       "u"
+#define SCNx8       "x"
+#define SCNX8       "X"
+#define SCNoLEAST8  "o"
+#define SCNuLEAST8  "u"
+#define SCNxLEAST8  "x"
+#define SCNXLEAST8  "X"
+#define SCNoFAST8   "o"
+#define SCNuFAST8   "u"
+#define SCNxFAST8   "x"
+#define SCNXFAST8   "X"
+
+#define SCNo16       "ho"
+#define SCNu16       "hu"
+#define SCNx16       "hx"
+#define SCNX16       "hX"
+#define SCNoLEAST16  "ho"
+#define SCNuLEAST16  "hu"
+#define SCNxLEAST16  "hx"
+#define SCNXLEAST16  "hX"
+#define SCNoFAST16   "ho"
+#define SCNuFAST16   "hu"
+#define SCNxFAST16   "hx"
+#define SCNXFAST16   "hX"
+
+#define SCNo32       "lo"
+#define SCNu32       "lu"
+#define SCNx32       "lx"
+#define SCNX32       "lX"
+#define SCNoLEAST32  "lo"
+#define SCNuLEAST32  "lu"
+#define SCNxLEAST32  "lx"
+#define SCNXLEAST32  "lX"
+#define SCNoFAST32   "lo"
+#define SCNuFAST32   "lu"
+#define SCNxFAST32   "lx"
+#define SCNXFAST32   "lX"
+
+#define SCNo64       "I64o"
+#define SCNu64       "I64u"
+#define SCNx64       "I64x"
+#define SCNX64       "I64X"
+#define SCNoLEAST64  "I64o"
+#define SCNuLEAST64  "I64u"
+#define SCNxLEAST64  "I64x"
+#define SCNXLEAST64  "I64X"
+#define SCNoFAST64   "I64o"
+#define SCNuFAST64   "I64u"
+#define SCNxFAST64   "I64x"
+#define SCNXFAST64   "I64X"
+
+#define SCNoMAX     "I64o"
+#define SCNuMAX     "I64u"
+#define SCNxMAX     "I64x"
+#define SCNXMAX     "I64X"
+
+#ifdef _WIN64 // [
+#  define SCNoPTR     "I64o"
+#  define SCNuPTR     "I64u"
+#  define SCNxPTR     "I64x"
+#  define SCNXPTR     "I64X"
+#else  // _WIN64 ][
+#  define SCNoPTR     "lo"
+#  define SCNuPTR     "lu"
+#  define SCNxPTR     "lx"
+#  define SCNXPTR     "lX"
+#endif  // _WIN64 ]
+
+#endif // __STDC_FORMAT_MACROS ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// This is modified version of div() function from Microsoft's div.c found
+// in %MSVC.NET%\crt\src\div.c
+#ifdef STATIC_IMAXDIV // [
+static
+#else // STATIC_IMAXDIV ][
+_inline
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
+{
+   imaxdiv_t result;
+
+   result.quot = numer / denom;
+   result.rem = numer % denom;
+
+   if (numer < 0 && result.rem > 0) {
+      // did division wrong; must fix up
+      ++result.quot;
+      result.rem -= denom;
+   }
+
+   return result;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+#endif // _MSC_VER >= 1800
+
+#endif // _MSC_INTTYPES_H_ ]
diff --git a/include/rapidjson/msinttypes/stdint.h b/include/rapidjson/msinttypes/stdint.h
new file mode 100644
index 0000000000..3d4477b9a0
--- /dev/null
+++ b/include/rapidjson/msinttypes/stdint.h
@@ -0,0 +1,300 @@
+// ISO C9x  compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
+// 
+//  Copyright (c) 2006-2013 Alexander Chemeris
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+// 
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+// 
+//   3. Neither the name of the product nor the names of its contributors may
+//      be used to endorse or promote products derived from this software
+//      without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+///////////////////////////////////////////////////////////////////////////////
+
+// The above software in this distribution may have been modified by 
+// THL A29 Limited ("Tencent Modifications"). 
+// All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+// miloyip: Originally Visual Studio 2010 uses its own stdint.h. However it generates warning with INT64_C(), so change to use this file for vs2010.
+#if _MSC_VER >= 1600 // [
+#include <stdint.h>
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   See footnote 224 at page 260
+
+#undef INT8_C
+#undef INT16_C
+#undef INT32_C
+#undef INT64_C
+#undef UINT8_C
+#undef UINT16_C
+#undef UINT32_C
+#undef UINT64_C
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val)  val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val)  val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C //   [
+#  define INTMAX_C   INT64_C
+#endif // INTMAX_C    ]
+#ifndef UINTMAX_C //  [
+#  define UINTMAX_C  UINT64_C
+#endif // UINTMAX_C   ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#else // ] _MSC_VER >= 1700 [
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we have to wrap <wchar.h> include with 'extern "C++" {}'
+// or compiler would give many errors like this:
+//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#if defined(__cplusplus) && !defined(_M_ARM)
+extern "C" {
+#endif
+#  include <wchar.h>
+#if defined(__cplusplus) && !defined(_M_ARM)
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+#  if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+#     define _W64 __w64
+#  else
+#     define _W64
+#  endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 doesn't
+// realize that, e.g. char has the same size as __int8
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+   typedef signed char       int8_t;
+   typedef signed short      int16_t;
+   typedef signed int        int32_t;
+   typedef unsigned char     uint8_t;
+   typedef unsigned short    uint16_t;
+   typedef unsigned int      uint32_t;
+#else
+   typedef signed __int8     int8_t;
+   typedef signed __int16    int16_t;
+   typedef signed __int32    int32_t;
+   typedef unsigned __int8   uint8_t;
+   typedef unsigned __int16  uint16_t;
+   typedef unsigned __int32  uint32_t;
+#endif
+typedef signed __int64       int64_t;
+typedef unsigned __int64     uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t    int_least8_t;
+typedef int16_t   int_least16_t;
+typedef int32_t   int_least32_t;
+typedef int64_t   int_least64_t;
+typedef uint8_t   uint_least8_t;
+typedef uint16_t  uint_least16_t;
+typedef uint32_t  uint_least32_t;
+typedef uint64_t  uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t    int_fast8_t;
+typedef int16_t   int_fast16_t;
+typedef int32_t   int_fast32_t;
+typedef int64_t   int_fast64_t;
+typedef uint8_t   uint_fast8_t;
+typedef uint16_t  uint_fast16_t;
+typedef uint32_t  uint_fast32_t;
+typedef uint64_t  uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+   typedef signed __int64    intptr_t;
+   typedef unsigned __int64  uintptr_t;
+#else // _WIN64 ][
+   typedef _W64 signed int   intptr_t;
+   typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t   intmax_t;
+typedef uint64_t  uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [   See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN     ((int8_t)_I8_MIN)
+#define INT8_MAX     _I8_MAX
+#define INT16_MIN    ((int16_t)_I16_MIN)
+#define INT16_MAX    _I16_MAX
+#define INT32_MIN    ((int32_t)_I32_MIN)
+#define INT32_MAX    _I32_MAX
+#define INT64_MIN    ((int64_t)_I64_MIN)
+#define INT64_MAX    _I64_MAX
+#define UINT8_MAX    _UI8_MAX
+#define UINT16_MAX   _UI16_MAX
+#define UINT32_MAX   _UI32_MAX
+#define UINT64_MAX   _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN    INT8_MIN
+#define INT_LEAST8_MAX    INT8_MAX
+#define INT_LEAST16_MIN   INT16_MIN
+#define INT_LEAST16_MAX   INT16_MAX
+#define INT_LEAST32_MIN   INT32_MIN
+#define INT_LEAST32_MAX   INT32_MAX
+#define INT_LEAST64_MIN   INT64_MIN
+#define INT_LEAST64_MAX   INT64_MAX
+#define UINT_LEAST8_MAX   UINT8_MAX
+#define UINT_LEAST16_MAX  UINT16_MAX
+#define UINT_LEAST32_MAX  UINT32_MAX
+#define UINT_LEAST64_MAX  UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN    INT8_MIN
+#define INT_FAST8_MAX    INT8_MAX
+#define INT_FAST16_MIN   INT16_MIN
+#define INT_FAST16_MAX   INT16_MAX
+#define INT_FAST32_MIN   INT32_MIN
+#define INT_FAST32_MAX   INT32_MAX
+#define INT_FAST64_MIN   INT64_MIN
+#define INT_FAST64_MAX   INT64_MAX
+#define UINT_FAST8_MAX   UINT8_MAX
+#define UINT_FAST16_MAX  UINT16_MAX
+#define UINT_FAST32_MAX  UINT32_MAX
+#define UINT_FAST64_MAX  UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+#  define INTPTR_MIN   INT64_MIN
+#  define INTPTR_MAX   INT64_MAX
+#  define UINTPTR_MAX  UINT64_MAX
+#else // _WIN64 ][
+#  define INTPTR_MIN   INT32_MIN
+#  define INTPTR_MAX   INT32_MAX
+#  define UINTPTR_MAX  UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN   INT64_MIN
+#define INTMAX_MAX   INT64_MAX
+#define UINTMAX_MAX  UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+#  define PTRDIFF_MIN  _I64_MIN
+#  define PTRDIFF_MAX  _I64_MAX
+#else  // _WIN64 ][
+#  define PTRDIFF_MIN  _I32_MIN
+#  define PTRDIFF_MAX  _I32_MAX
+#endif  // _WIN64 ]
+
+#define SIG_ATOMIC_MIN  INT_MIN
+#define SIG_ATOMIC_MAX  INT_MAX
+
+#ifndef SIZE_MAX // [
+#  ifdef _WIN64 // [
+#     define SIZE_MAX  _UI64_MAX
+#  else // _WIN64 ][
+#     define SIZE_MAX  _UI32_MAX
+#  endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+#  define WCHAR_MIN  0
+#endif  // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+#  define WCHAR_MAX  _UI16_MAX
+#endif  // WCHAR_MAX ]
+
+#define WINT_MIN  0
+#define WINT_MAX  _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Limits of other integer types
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val)  val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val)  val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C //   [
+#  define INTMAX_C   INT64_C
+#endif // INTMAX_C    ]
+#ifndef UINTMAX_C //  [
+#  define UINTMAX_C  UINT64_C
+#endif // UINTMAX_C   ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#endif // _MSC_VER >= 1600 ]
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/include/rapidjson/ostreamwrapper.h b/include/rapidjson/ostreamwrapper.h
new file mode 100644
index 0000000000..11ed4d33f9
--- /dev/null
+++ b/include/rapidjson/ostreamwrapper.h
@@ -0,0 +1,81 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_OSTREAMWRAPPER_H_
+#define RAPIDJSON_OSTREAMWRAPPER_H_
+
+#include "stream.h"
+#include <iosfwd>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Wrapper of \c std::basic_ostream into RapidJSON's Stream concept.
+/*!
+    The classes can be wrapped including but not limited to:
+
+    - \c std::ostringstream
+    - \c std::stringstream
+    - \c std::wpstringstream
+    - \c std::wstringstream
+    - \c std::ifstream
+    - \c std::fstream
+    - \c std::wofstream
+    - \c std::wfstream
+
+    \tparam StreamType Class derived from \c std::basic_ostream.
+*/
+   
+template <typename StreamType>
+class BasicOStreamWrapper {
+public:
+    typedef typename StreamType::char_type Ch;
+    BasicOStreamWrapper(StreamType& stream) : stream_(stream) {}
+
+    void Put(Ch c) {
+        stream_.put(c);
+    }
+
+    void Flush() {
+        stream_.flush();
+    }
+
+    // Not implemented
+    char Peek() const { RAPIDJSON_ASSERT(false); return 0; }
+    char Take() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    BasicOStreamWrapper(const BasicOStreamWrapper&);
+    BasicOStreamWrapper& operator=(const BasicOStreamWrapper&);
+
+    StreamType& stream_;
+};
+
+typedef BasicOStreamWrapper<std::ostream> OStreamWrapper;
+typedef BasicOStreamWrapper<std::wostream> WOStreamWrapper;
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_OSTREAMWRAPPER_H_
diff --git a/include/rapidjson/pointer.h b/include/rapidjson/pointer.h
new file mode 100644
index 0000000000..355929ede0
--- /dev/null
+++ b/include/rapidjson/pointer.h
@@ -0,0 +1,1482 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_POINTER_H_
+#define RAPIDJSON_POINTER_H_
+
+#include "document.h"
+#include "uri.h"
+#include "internal/itoa.h"
+#include "error/error.h" // PointerParseErrorCode
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(switch-enum)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+#if defined(RAPIDJSON_CPLUSPLUS) && RAPIDJSON_CPLUSPLUS >= 201703L
+#define RAPIDJSON_IF_CONSTEXPR if constexpr
+#else
+#define RAPIDJSON_IF_CONSTEXPR if
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+static const SizeType kPointerInvalidIndex = ~SizeType(0);  //!< Represents an invalid index in GenericPointer::Token
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericPointer
+
+//! Represents a JSON Pointer. Use Pointer for UTF8 encoding and default allocator.
+/*!
+    This class implements RFC 6901 "JavaScript Object Notation (JSON) Pointer" 
+    (https://tools.ietf.org/html/rfc6901).
+
+    A JSON pointer is for identifying a specific value in a JSON document
+    (GenericDocument). It can simplify coding of DOM tree manipulation, because it
+    can access multiple-level depth of DOM tree with single API call.
+
+    After it parses a string representation (e.g. "/foo/0" or URI fragment 
+    representation (e.g. "#/foo/0") into its internal representation (tokens),
+    it can be used to resolve a specific value in multiple documents, or sub-tree 
+    of documents.
+
+    Contrary to GenericValue, Pointer can be copy constructed and copy assigned.
+    Apart from assignment, a Pointer cannot be modified after construction.
+
+    Although Pointer is very convenient, please aware that constructing Pointer
+    involves parsing and dynamic memory allocation. A special constructor with user-
+    supplied tokens eliminates these.
+
+    GenericPointer depends on GenericDocument and GenericValue.
+
+    \tparam ValueType The value type of the DOM tree. E.g. GenericValue<UTF8<> >
+    \tparam Allocator The allocator type for allocating memory for internal representation.
+
+    \note GenericPointer uses same encoding of ValueType.
+    However, Allocator of GenericPointer is independent of Allocator of Value.
+*/
+template <typename ValueType, typename Allocator = CrtAllocator>
+class GenericPointer {
+public:
+    typedef typename ValueType::EncodingType EncodingType;  //!< Encoding type from Value
+    typedef typename ValueType::Ch Ch;                      //!< Character type from Value
+    typedef GenericUri<ValueType, Allocator> UriType;
+
+
+  //! A token is the basic units of internal representation.
+    /*!
+        A JSON pointer string representation "/foo/123" is parsed to two tokens: 
+        "foo" and 123. 123 will be represented in both numeric form and string form.
+        They are resolved according to the actual value type (object or array).
+
+        For token that are not numbers, or the numeric value is out of bound
+        (greater than limits of SizeType), they are only treated as string form
+        (i.e. the token's index will be equal to kPointerInvalidIndex).
+
+        This struct is public so that user can create a Pointer without parsing and 
+        allocation, using a special constructor.
+    */
+    struct Token {
+        const Ch* name;             //!< Name of the token. It has null character at the end but it can contain null character.
+        SizeType length;            //!< Length of the name.
+        SizeType index;             //!< A valid array index, if it is not equal to kPointerInvalidIndex.
+    };
+
+    //!@name Constructors and destructor.
+    //@{
+
+    //! Default constructor.
+    GenericPointer(Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {}
+
+    //! Constructor that parses a string or URI fragment representation.
+    /*!
+        \param source A null-terminated, string or URI fragment representation of JSON pointer.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+    */
+    explicit GenericPointer(const Ch* source, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source, internal::StrLen(source));
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Constructor that parses a string or URI fragment representation.
+    /*!
+        \param source A string or URI fragment representation of JSON pointer.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+        \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+    */
+    explicit GenericPointer(const std::basic_string<Ch>& source, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source.c_str(), source.size());
+    }
+#endif
+
+    //! Constructor that parses a string or URI fragment representation, with length of the source string.
+    /*!
+        \param source A string or URI fragment representation of JSON pointer.
+        \param length Length of source.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+        \note Slightly faster than the overload without length.
+    */
+    GenericPointer(const Ch* source, size_t length, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source, length);
+    }
+
+    //! Constructor with user-supplied tokens.
+    /*!
+        This constructor let user supplies const array of tokens.
+        This prevents the parsing process and eliminates allocation.
+        This is preferred for memory constrained environments.
+
+        \param tokens An constant array of tokens representing the JSON pointer.
+        \param tokenCount Number of tokens.
+
+        \b Example
+        \code
+        #define NAME(s) { s, sizeof(s) / sizeof(s[0]) - 1, kPointerInvalidIndex }
+        #define INDEX(i) { #i, sizeof(#i) - 1, i }
+
+        static const Pointer::Token kTokens[] = { NAME("foo"), INDEX(123) };
+        static const Pointer p(kTokens, sizeof(kTokens) / sizeof(kTokens[0]));
+        // Equivalent to static const Pointer p("/foo/123");
+
+        #undef NAME
+        #undef INDEX
+        \endcode
+    */
+    GenericPointer(const Token* tokens, size_t tokenCount) : allocator_(), ownAllocator_(), nameBuffer_(), tokens_(const_cast<Token*>(tokens)), tokenCount_(tokenCount), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {}
+
+    //! Copy constructor.
+    GenericPointer(const GenericPointer& rhs) : allocator_(), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        *this = rhs;
+    }
+
+    //! Copy constructor.
+    GenericPointer(const GenericPointer& rhs, Allocator* allocator) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        *this = rhs;
+    }
+
+    //! Destructor.
+    ~GenericPointer() {
+        if (nameBuffer_)    // If user-supplied tokens constructor is used, nameBuffer_ is nullptr and tokens_ are not deallocated.
+            Allocator::Free(tokens_);
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    //! Assignment operator.
+    GenericPointer& operator=(const GenericPointer& rhs) {
+        if (this != &rhs) {
+            // Do not delete ownAllcator
+            if (nameBuffer_)
+                Allocator::Free(tokens_);
+
+            tokenCount_ = rhs.tokenCount_;
+            parseErrorOffset_ = rhs.parseErrorOffset_;
+            parseErrorCode_ = rhs.parseErrorCode_;
+
+            if (rhs.nameBuffer_)
+                CopyFromRaw(rhs); // Normally parsed tokens.
+            else {
+                tokens_ = rhs.tokens_; // User supplied const tokens.
+                nameBuffer_ = 0;
+            }
+        }
+        return *this;
+    }
+
+    //! Swap the content of this pointer with an other.
+    /*!
+        \param other The pointer to swap with.
+        \note Constant complexity.
+    */
+    GenericPointer& Swap(GenericPointer& other) RAPIDJSON_NOEXCEPT {
+        internal::Swap(allocator_, other.allocator_);
+        internal::Swap(ownAllocator_, other.ownAllocator_);
+        internal::Swap(nameBuffer_, other.nameBuffer_);
+        internal::Swap(tokens_, other.tokens_);
+        internal::Swap(tokenCount_, other.tokenCount_);
+        internal::Swap(parseErrorOffset_, other.parseErrorOffset_);
+        internal::Swap(parseErrorCode_, other.parseErrorCode_);
+        return *this;
+    }
+
+    //! free-standing swap function helper
+    /*!
+        Helper function to enable support for common swap implementation pattern based on \c std::swap:
+        \code
+        void swap(MyClass& a, MyClass& b) {
+            using std::swap;
+            swap(a.pointer, b.pointer);
+            // ...
+        }
+        \endcode
+        \see Swap()
+     */
+    friend inline void swap(GenericPointer& a, GenericPointer& b) RAPIDJSON_NOEXCEPT { a.Swap(b); }
+
+    //@}
+
+    //!@name Append token
+    //@{
+
+    //! Append a token and return a new Pointer
+    /*!
+        \param token Token to be appended.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const Token& token, Allocator* allocator = 0) const {
+        GenericPointer r;
+        r.allocator_ = allocator;
+        Ch *p = r.CopyFromRaw(*this, 1, token.length + 1);
+        std::memcpy(p, token.name, (token.length + 1) * sizeof(Ch));
+        r.tokens_[tokenCount_].name = p;
+        r.tokens_[tokenCount_].length = token.length;
+        r.tokens_[tokenCount_].index = token.index;
+        return r;
+    }
+
+    //! Append a name token with length, and return a new Pointer
+    /*!
+        \param name Name to be appended.
+        \param length Length of name.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const Ch* name, SizeType length, Allocator* allocator = 0) const {
+        Token token = { name, length, kPointerInvalidIndex };
+        return Append(token, allocator);
+    }
+
+    //! Append a name token without length, and return a new Pointer
+    /*!
+        \param name Name (const Ch*) to be appended.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >), (GenericPointer))
+    Append(T* name, Allocator* allocator = 0) const {
+        return Append(name, internal::StrLen(name), allocator);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Append a name token, and return a new Pointer
+    /*!
+        \param name Name to be appended.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const std::basic_string<Ch>& name, Allocator* allocator = 0) const {
+        return Append(name.c_str(), static_cast<SizeType>(name.size()), allocator);
+    }
+#endif
+
+    //! Append a index token, and return a new Pointer
+    /*!
+        \param index Index to be appended.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(SizeType index, Allocator* allocator = 0) const {
+        char buffer[21];
+        char* end = sizeof(SizeType) == 4 ? internal::u32toa(index, buffer) : internal::u64toa(index, buffer);
+        SizeType length = static_cast<SizeType>(end - buffer);
+        buffer[length] = '\0';
+
+        RAPIDJSON_IF_CONSTEXPR (sizeof(Ch) == 1) {
+            Token token = { reinterpret_cast<Ch*>(buffer), length, index };
+            return Append(token, allocator);
+        }
+        else {
+            Ch name[21];
+            for (size_t i = 0; i <= length; i++)
+                name[i] = static_cast<Ch>(buffer[i]);
+            Token token = { name, length, index };
+            return Append(token, allocator);
+        }
+    }
+
+    //! Append a token by value, and return a new Pointer
+    /*!
+        \param token token to be appended.
+        \param allocator Allocator for the newly return Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const ValueType& token, Allocator* allocator = 0) const {
+        if (token.IsString())
+            return Append(token.GetString(), token.GetStringLength(), allocator);
+        else {
+            RAPIDJSON_ASSERT(token.IsUint64());
+            RAPIDJSON_ASSERT(token.GetUint64() <= SizeType(~0));
+            return Append(static_cast<SizeType>(token.GetUint64()), allocator);
+        }
+    }
+
+    //!@name Handling Parse Error
+    //@{
+
+    //! Check whether this is a valid pointer.
+    bool IsValid() const { return parseErrorCode_ == kPointerParseErrorNone; }
+
+    //! Get the parsing error offset in code unit.
+    size_t GetParseErrorOffset() const { return parseErrorOffset_; }
+
+    //! Get the parsing error code.
+    PointerParseErrorCode GetParseErrorCode() const { return parseErrorCode_; }
+
+    //@}
+
+    //! Get the allocator of this pointer.
+    Allocator& GetAllocator() { return *allocator_; }
+
+    //!@name Tokens
+    //@{
+
+    //! Get the token array (const version only).
+    const Token* GetTokens() const { return tokens_; }
+
+    //! Get the number of tokens.
+    size_t GetTokenCount() const { return tokenCount_; }
+
+    //@}
+
+    //!@name Equality/inequality operators
+    //@{
+
+    //! Equality operator.
+    /*!
+        \note When any pointers are invalid, always returns false.
+    */
+    bool operator==(const GenericPointer& rhs) const {
+        if (!IsValid() || !rhs.IsValid() || tokenCount_ != rhs.tokenCount_)
+            return false;
+
+        for (size_t i = 0; i < tokenCount_; i++) {
+            if (tokens_[i].index != rhs.tokens_[i].index ||
+                tokens_[i].length != rhs.tokens_[i].length || 
+                (tokens_[i].length != 0 && std::memcmp(tokens_[i].name, rhs.tokens_[i].name, sizeof(Ch)* tokens_[i].length) != 0))
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    //! Inequality operator.
+    /*!
+        \note When any pointers are invalid, always returns true.
+    */
+    bool operator!=(const GenericPointer& rhs) const { return !(*this == rhs); }
+
+    //! Less than operator.
+    /*!
+        \note Invalid pointers are always greater than valid ones.
+    */
+    bool operator<(const GenericPointer& rhs) const {
+        if (!IsValid())
+            return false;
+        if (!rhs.IsValid())
+            return true;
+
+        if (tokenCount_ != rhs.tokenCount_)
+            return tokenCount_ < rhs.tokenCount_;
+
+        for (size_t i = 0; i < tokenCount_; i++) {
+            if (tokens_[i].index != rhs.tokens_[i].index)
+                return tokens_[i].index < rhs.tokens_[i].index;
+
+            if (tokens_[i].length != rhs.tokens_[i].length)
+                return tokens_[i].length < rhs.tokens_[i].length;
+
+            if (int cmp = std::memcmp(tokens_[i].name, rhs.tokens_[i].name, sizeof(Ch) * tokens_[i].length))
+                return cmp < 0;
+        }
+
+        return false;
+    }
+
+    //@}
+
+    //!@name Stringify
+    //@{
+
+    //! Stringify the pointer into string representation.
+    /*!
+        \tparam OutputStream Type of output stream.
+        \param os The output stream.
+    */
+    template<typename OutputStream>
+    bool Stringify(OutputStream& os) const {
+        return Stringify<false, OutputStream>(os);
+    }
+
+    //! Stringify the pointer into URI fragment representation.
+    /*!
+        \tparam OutputStream Type of output stream.
+        \param os The output stream.
+    */
+    template<typename OutputStream>
+    bool StringifyUriFragment(OutputStream& os) const {
+        return Stringify<true, OutputStream>(os);
+    }
+
+    //@}
+
+    //!@name Create value
+    //@{
+
+    //! Create a value in a subtree.
+    /*!
+        If the value is not exist, it creates all parent values and a JSON Null value.
+        So it always succeed and return the newly created or existing value.
+
+        Remind that it may change types of parents according to tokens, so it 
+        potentially removes previously stored values. For example, if a document 
+        was an array, and "/foo" is used to create a value, then the document 
+        will be changed to an object, and all existing array elements are lost.
+
+        \param root Root value of a DOM subtree to be resolved. It can be any value other than document root.
+        \param allocator Allocator for creating the values if the specified value or its parents are not exist.
+        \param alreadyExist If non-null, it stores whether the resolved value is already exist.
+        \return The resolved newly created (a JSON Null value), or already exists value.
+    */
+    ValueType& Create(ValueType& root, typename ValueType::AllocatorType& allocator, bool* alreadyExist = 0) const {
+        RAPIDJSON_ASSERT(IsValid());
+        ValueType* v = &root;
+        bool exist = true;
+        for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            if (v->IsArray() && t->name[0] == '-' && t->length == 1) {
+                v->PushBack(ValueType().Move(), allocator);
+                v = &((*v)[v->Size() - 1]);
+                exist = false;
+            }
+            else {
+                if (t->index == kPointerInvalidIndex) { // must be object name
+                    if (!v->IsObject())
+                        v->SetObject(); // Change to Object
+                }
+                else { // object name or array index
+                    if (!v->IsArray() && !v->IsObject())
+                        v->SetArray(); // Change to Array
+                }
+
+                if (v->IsArray()) {
+                    if (t->index >= v->Size()) {
+                        v->Reserve(t->index + 1, allocator);
+                        while (t->index >= v->Size())
+                            v->PushBack(ValueType().Move(), allocator);
+                        exist = false;
+                    }
+                    v = &((*v)[t->index]);
+                }
+                else {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericValue<EncodingType>(GenericStringRef<Ch>(t->name, t->length)));
+                    if (m == v->MemberEnd()) {
+                        v->AddMember(ValueType(t->name, t->length, allocator).Move(), ValueType().Move(), allocator);
+                        m = v->MemberEnd();
+                        v = &(--m)->value; // Assumes AddMember() appends at the end
+                        exist = false;
+                    }
+                    else
+                        v = &m->value;
+                }
+            }
+        }
+
+        if (alreadyExist)
+            *alreadyExist = exist;
+
+        return *v;
+    }
+
+    //! Creates a value in a document.
+    /*!
+        \param document A document to be resolved.
+        \param alreadyExist If non-null, it stores whether the resolved value is already exist.
+        \return The resolved newly created, or already exists value.
+    */
+    template <typename stackAllocator>
+    ValueType& Create(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, bool* alreadyExist = 0) const {
+        return Create(document, document.GetAllocator(), alreadyExist);
+    }
+
+    //@}
+
+    //!@name Compute URI
+    //@{
+
+    //! Compute the in-scope URI for a subtree.
+    //  For use with JSON pointers into JSON schema documents.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param rootUri Root URI
+        \param unresolvedTokenIndex If the pointer cannot resolve a token in the pointer, this parameter can obtain the index of unresolved token.
+        \param allocator Allocator for Uris
+        \return Uri if it can be resolved. Otherwise null.
+
+        \note
+        There are only 3 situations when a URI cannot be resolved:
+        1. A value in the path is not an array nor object.
+        2. An object value does not contain the token.
+        3. A token is out of range of an array value.
+
+        Use unresolvedTokenIndex to retrieve the token index.
+    */
+    UriType GetUri(ValueType& root, const UriType& rootUri, size_t* unresolvedTokenIndex = 0, Allocator* allocator = 0) const {
+        static const Ch kIdString[] = { 'i', 'd', '\0' };
+        static const ValueType kIdValue(kIdString, 2);
+        UriType base = UriType(rootUri, allocator);
+        RAPIDJSON_ASSERT(IsValid());
+        ValueType* v = &root;
+        for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            switch (v->GetType()) {
+                case kObjectType:
+                {
+                    // See if we have an id, and if so resolve with the current base
+                    typename ValueType::MemberIterator m = v->FindMember(kIdValue);
+                    if (m != v->MemberEnd() && (m->value).IsString()) {
+                        UriType here = UriType(m->value, allocator).Resolve(base, allocator);
+                        base = here;
+                    }
+                    m = v->FindMember(GenericValue<EncodingType>(GenericStringRef<Ch>(t->name, t->length)));
+                    if (m == v->MemberEnd())
+                        break;
+                    v = &m->value;
+                }
+                  continue;
+                case kArrayType:
+                    if (t->index == kPointerInvalidIndex || t->index >= v->Size())
+                        break;
+                    v = &((*v)[t->index]);
+                    continue;
+                default:
+                    break;
+            }
+
+            // Error: unresolved token
+            if (unresolvedTokenIndex)
+                *unresolvedTokenIndex = static_cast<size_t>(t - tokens_);
+            return UriType(allocator);
+        }
+        return base;
+    }
+
+    UriType GetUri(const ValueType& root, const UriType& rootUri, size_t* unresolvedTokenIndex = 0, Allocator* allocator = 0) const {
+      return GetUri(const_cast<ValueType&>(root), rootUri, unresolvedTokenIndex, allocator);
+    }
+
+
+    //!@name Query value
+    //@{
+
+    //! Query a value in a subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param unresolvedTokenIndex If the pointer cannot resolve a token in the pointer, this parameter can obtain the index of unresolved token.
+        \return Pointer to the value if it can be resolved. Otherwise null.
+
+        \note
+        There are only 3 situations when a value cannot be resolved:
+        1. A value in the path is not an array nor object.
+        2. An object value does not contain the token.
+        3. A token is out of range of an array value.
+
+        Use unresolvedTokenIndex to retrieve the token index.
+    */
+    ValueType* Get(ValueType& root, size_t* unresolvedTokenIndex = 0) const {
+        RAPIDJSON_ASSERT(IsValid());
+        ValueType* v = &root;
+        for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            switch (v->GetType()) {
+            case kObjectType:
+                {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericValue<EncodingType>(GenericStringRef<Ch>(t->name, t->length)));
+                    if (m == v->MemberEnd())
+                        break;
+                    v = &m->value;
+                }
+                continue;
+            case kArrayType:
+                if (t->index == kPointerInvalidIndex || t->index >= v->Size())
+                    break;
+                v = &((*v)[t->index]);
+                continue;
+            default:
+                break;
+            }
+
+            // Error: unresolved token
+            if (unresolvedTokenIndex)
+                *unresolvedTokenIndex = static_cast<size_t>(t - tokens_);
+            return 0;
+        }
+        return v;
+    }
+
+    //! Query a const value in a const subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \return Pointer to the value if it can be resolved. Otherwise null.
+    */
+    const ValueType* Get(const ValueType& root, size_t* unresolvedTokenIndex = 0) const { 
+        return Get(const_cast<ValueType&>(root), unresolvedTokenIndex);
+    }
+
+    //@}
+
+    //!@name Query a value with default
+    //@{
+
+    //! Query a value in a subtree with default value.
+    /*!
+        Similar to Get(), but if the specified value do not exists, it creates all parents and clone the default value.
+        So that this function always succeed.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param defaultValue Default value to be cloned if the value was not exists.
+        \param allocator Allocator for creating the values if the specified value or its parents are not exist.
+        \see Create()
+    */
+    ValueType& GetWithDefault(ValueType& root, const ValueType& defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.CopyFrom(defaultValue, allocator);
+    }
+
+    //! Query a value in a subtree with default null-terminated string.
+    ValueType& GetWithDefault(ValueType& root, const Ch* defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.SetString(defaultValue, allocator);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Query a value in a subtree with default std::basic_string.
+    ValueType& GetWithDefault(ValueType& root, const std::basic_string<Ch>& defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.SetString(defaultValue, allocator);
+    }
+#endif
+
+    //! Query a value in a subtree with default primitive value.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    GetWithDefault(ValueType& root, T defaultValue, typename ValueType::AllocatorType& allocator) const {
+        return GetWithDefault(root, ValueType(defaultValue).Move(), allocator);
+    }
+
+    //! Query a value in a document with default value.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const ValueType& defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+    //! Query a value in a document with default null-terminated string.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const Ch* defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Query a value in a document with default std::basic_string.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const std::basic_string<Ch>& defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+#endif
+
+    //! Query a value in a document with default primitive value.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T, typename stackAllocator>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, T defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+    //@}
+
+    //!@name Set a value
+    //@{
+
+    //! Set a value in a subtree, with move semantics.
+    /*!
+        It creates all parents if they are not exist or types are different to the tokens.
+        So this function always succeeds but potentially remove existing values.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param value Value to be set.
+        \param allocator Allocator for creating the values if the specified value or its parents are not exist.
+        \see Create()
+    */
+    ValueType& Set(ValueType& root, ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = value;
+    }
+
+    //! Set a value in a subtree, with copy semantics.
+    ValueType& Set(ValueType& root, const ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator).CopyFrom(value, allocator);
+    }
+
+    //! Set a null-terminated string in a subtree.
+    ValueType& Set(ValueType& root, const Ch* value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value, allocator).Move();
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Set a std::basic_string in a subtree.
+    ValueType& Set(ValueType& root, const std::basic_string<Ch>& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value, allocator).Move();
+    }
+#endif
+
+    //! Set a primitive value in a subtree.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    Set(ValueType& root, T value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value).Move();
+    }
+
+    //! Set a value in a document, with move semantics.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, ValueType& value) const {
+        return Create(document) = value;
+    }
+
+    //! Set a value in a document, with copy semantics.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const ValueType& value) const {
+        return Create(document).CopyFrom(value, document.GetAllocator());
+    }
+
+    //! Set a null-terminated string in a document.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const Ch* value) const {
+        return Create(document) = ValueType(value, document.GetAllocator()).Move();
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Sets a std::basic_string in a document.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const std::basic_string<Ch>& value) const {
+        return Create(document) = ValueType(value, document.GetAllocator()).Move();
+    }
+#endif
+
+    //! Set a primitive value in a document.
+    /*!
+    \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T, typename stackAllocator>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+        Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, T value) const {
+            return Create(document) = value;
+    }
+
+    //@}
+
+    //!@name Swap a value
+    //@{
+
+    //! Swap a value with a value in a subtree.
+    /*!
+        It creates all parents if they are not exist or types are different to the tokens.
+        So this function always succeeds but potentially remove existing values.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param value Value to be swapped.
+        \param allocator Allocator for creating the values if the specified value or its parents are not exist.
+        \see Create()
+    */
+    ValueType& Swap(ValueType& root, ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator).Swap(value);
+    }
+
+    //! Swap a value with a value in a document.
+    template <typename stackAllocator>
+    ValueType& Swap(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, ValueType& value) const {
+        return Create(document).Swap(value);
+    }
+
+    //@}
+
+    //! Erase a value in a subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \return Whether the resolved value is found and erased.
+
+        \note Erasing with an empty pointer \c Pointer(""), i.e. the root, always fail and return false.
+    */
+    bool Erase(ValueType& root) const {
+        RAPIDJSON_ASSERT(IsValid());
+        if (tokenCount_ == 0) // Cannot erase the root
+            return false;
+
+        ValueType* v = &root;
+        const Token* last = tokens_ + (tokenCount_ - 1);
+        for (const Token *t = tokens_; t != last; ++t) {
+            switch (v->GetType()) {
+            case kObjectType:
+                {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericValue<EncodingType>(GenericStringRef<Ch>(t->name, t->length)));
+                    if (m == v->MemberEnd())
+                        return false;
+                    v = &m->value;
+                }
+                break;
+            case kArrayType:
+                if (t->index == kPointerInvalidIndex || t->index >= v->Size())
+                    return false;
+                v = &((*v)[t->index]);
+                break;
+            default:
+                return false;
+            }
+        }
+
+        switch (v->GetType()) {
+        case kObjectType:
+            return v->EraseMember(GenericStringRef<Ch>(last->name, last->length));
+        case kArrayType:
+            if (last->index == kPointerInvalidIndex || last->index >= v->Size())
+                return false;
+            v->Erase(v->Begin() + last->index);
+            return true;
+        default:
+            return false;
+        }
+    }
+
+private:
+    //! Clone the content from rhs to this.
+    /*!
+        \param rhs Source pointer.
+        \param extraToken Extra tokens to be allocated.
+        \param extraNameBufferSize Extra name buffer size (in number of Ch) to be allocated.
+        \return Start of non-occupied name buffer, for storing extra names.
+    */
+    Ch* CopyFromRaw(const GenericPointer& rhs, size_t extraToken = 0, size_t extraNameBufferSize = 0) {
+        if (!allocator_) // allocator is independently owned.
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        size_t nameBufferSize = rhs.tokenCount_; // null terminators for tokens
+        for (Token *t = rhs.tokens_; t != rhs.tokens_ + rhs.tokenCount_; ++t)
+            nameBufferSize += t->length;
+
+        tokenCount_ = rhs.tokenCount_ + extraToken;
+        tokens_ = static_cast<Token *>(allocator_->Malloc(tokenCount_ * sizeof(Token) + (nameBufferSize + extraNameBufferSize) * sizeof(Ch)));
+        nameBuffer_ = reinterpret_cast<Ch *>(tokens_ + tokenCount_);
+        if (rhs.tokenCount_ > 0) {
+            std::memcpy(tokens_, rhs.tokens_, rhs.tokenCount_ * sizeof(Token));
+        }
+        if (nameBufferSize > 0) {
+            std::memcpy(nameBuffer_, rhs.nameBuffer_, nameBufferSize * sizeof(Ch));
+        }
+
+        // The names of each token point to a string in the nameBuffer_. The
+        // previous memcpy copied over string pointers into the rhs.nameBuffer_,
+        // but they should point to the strings in the new nameBuffer_.
+        for (size_t i = 0; i < rhs.tokenCount_; ++i) {
+          // The offset between the string address and the name buffer should
+          // still be constant, so we can just get this offset and set each new
+          // token name according the new buffer start + the known offset.
+          std::ptrdiff_t name_offset = rhs.tokens_[i].name - rhs.nameBuffer_;
+          tokens_[i].name = nameBuffer_ + name_offset;
+        }
+
+        return nameBuffer_ + nameBufferSize;
+    }
+
+    //! Check whether a character should be percent-encoded.
+    /*!
+        According to RFC 3986 2.3 Unreserved Characters.
+        \param c The character (code unit) to be tested.
+    */
+    bool NeedPercentEncode(Ch c) const {
+        return !((c >= '0' && c <= '9') || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '_' || c =='~');
+    }
+
+    //! Parse a JSON String or its URI fragment representation into tokens.
+#ifndef __clang__ // -Wdocumentation
+    /*!
+        \param source Either a JSON Pointer string, or its URI fragment representation. Not need to be null terminated.
+        \param length Length of the source string.
+        \note Source cannot be JSON String Representation of JSON Pointer, e.g. In "/\u0000", \u0000 will not be unescaped.
+    */
+#endif
+    void Parse(const Ch* source, size_t length) {
+        RAPIDJSON_ASSERT(source != NULL);
+        RAPIDJSON_ASSERT(nameBuffer_ == 0);
+        RAPIDJSON_ASSERT(tokens_ == 0);
+
+        // Create own allocator if user did not supply.
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        // Count number of '/' as tokenCount
+        tokenCount_ = 0;
+        for (const Ch* s = source; s != source + length; s++) 
+            if (*s == '/')
+                tokenCount_++;
+
+        Token* token = tokens_ = static_cast<Token *>(allocator_->Malloc(tokenCount_ * sizeof(Token) + length * sizeof(Ch)));
+        Ch* name = nameBuffer_ = reinterpret_cast<Ch *>(tokens_ + tokenCount_);
+        size_t i = 0;
+
+        // Detect if it is a URI fragment
+        bool uriFragment = false;
+        if (source[i] == '#') {
+            uriFragment = true;
+            i++;
+        }
+
+        if (i != length && source[i] != '/') {
+            parseErrorCode_ = kPointerParseErrorTokenMustBeginWithSolidus;
+            goto error;
+        }
+
+        while (i < length) {
+            RAPIDJSON_ASSERT(source[i] == '/');
+            i++; // consumes '/'
+
+            token->name = name;
+            bool isNumber = true;
+
+            while (i < length && source[i] != '/') {
+                Ch c = source[i];
+                if (uriFragment) {
+                    // Decoding percent-encoding for URI fragment
+                    if (c == '%') {
+                        PercentDecodeStream is(&source[i], source + length);
+                        GenericInsituStringStream<EncodingType> os(name);
+                        Ch* begin = os.PutBegin();
+                        if (!Transcoder<UTF8<>, EncodingType>().Validate(is, os) || !is.IsValid()) {
+                            parseErrorCode_ = kPointerParseErrorInvalidPercentEncoding;
+                            goto error;
+                        }
+                        size_t len = os.PutEnd(begin);
+                        i += is.Tell() - 1;
+                        if (len == 1)
+                            c = *name;
+                        else {
+                            name += len;
+                            isNumber = false;
+                            i++;
+                            continue;
+                        }
+                    }
+                    else if (NeedPercentEncode(c)) {
+                        parseErrorCode_ = kPointerParseErrorCharacterMustPercentEncode;
+                        goto error;
+                    }
+                }
+
+                i++;
+
+                // Escaping "~0" -> '~', "~1" -> '/'
+                if (c == '~') {
+                    if (i < length) {
+                        c = source[i];
+                        if (c == '0')       c = '~';
+                        else if (c == '1')  c = '/';
+                        else {
+                            parseErrorCode_ = kPointerParseErrorInvalidEscape;
+                            goto error;
+                        }
+                        i++;
+                    }
+                    else {
+                        parseErrorCode_ = kPointerParseErrorInvalidEscape;
+                        goto error;
+                    }
+                }
+
+                // First check for index: all of characters are digit
+                if (c < '0' || c > '9')
+                    isNumber = false;
+
+                *name++ = c;
+            }
+            token->length = static_cast<SizeType>(name - token->name);
+            if (token->length == 0)
+                isNumber = false;
+            *name++ = '\0'; // Null terminator
+
+            // Second check for index: more than one digit cannot have leading zero
+            if (isNumber && token->length > 1 && token->name[0] == '0')
+                isNumber = false;
+
+            // String to SizeType conversion
+            SizeType n = 0;
+            if (isNumber) {
+                for (size_t j = 0; j < token->length; j++) {
+                    SizeType m = n * 10 + static_cast<SizeType>(token->name[j] - '0');
+                    if (m < n) {   // overflow detection
+                        isNumber = false;
+                        break;
+                    }
+                    n = m;
+                }
+            }
+
+            token->index = isNumber ? n : kPointerInvalidIndex;
+            token++;
+        }
+
+        RAPIDJSON_ASSERT(name <= nameBuffer_ + length); // Should not overflow buffer
+        parseErrorCode_ = kPointerParseErrorNone;
+        return;
+
+    error:
+        Allocator::Free(tokens_);
+        nameBuffer_ = 0;
+        tokens_ = 0;
+        tokenCount_ = 0;
+        parseErrorOffset_ = i;
+        return;
+    }
+
+    //! Stringify to string or URI fragment representation.
+    /*!
+        \tparam uriFragment True for stringifying to URI fragment representation. False for string representation.
+        \tparam OutputStream type of output stream.
+        \param os The output stream.
+    */
+    template<bool uriFragment, typename OutputStream>
+    bool Stringify(OutputStream& os) const {
+        RAPIDJSON_ASSERT(IsValid());
+
+        if (uriFragment)
+            os.Put('#');
+
+        for (Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            os.Put('/');
+            for (size_t j = 0; j < t->length; j++) {
+                Ch c = t->name[j];
+                if (c == '~') {
+                    os.Put('~');
+                    os.Put('0');
+                }
+                else if (c == '/') {
+                    os.Put('~');
+                    os.Put('1');
+                }
+                else if (uriFragment && NeedPercentEncode(c)) { 
+                    // Transcode to UTF8 sequence
+                    GenericStringStream<typename ValueType::EncodingType> source(&t->name[j]);
+                    PercentEncodeStream<OutputStream> target(os);
+                    if (!Transcoder<EncodingType, UTF8<> >().Validate(source, target))
+                        return false;
+                    j += source.Tell() - 1;
+                }
+                else
+                    os.Put(c);
+            }
+        }
+        return true;
+    }
+
+    //! A helper stream for decoding a percent-encoded sequence into code unit.
+    /*!
+        This stream decodes %XY triplet into code unit (0-255).
+        If it encounters invalid characters, it sets output code unit as 0 and 
+        mark invalid, and to be checked by IsValid().
+    */
+    class PercentDecodeStream {
+    public:
+        typedef typename ValueType::Ch Ch;
+
+        //! Constructor
+        /*!
+            \param source Start of the stream
+            \param end Past-the-end of the stream.
+        */
+        PercentDecodeStream(const Ch* source, const Ch* end) : src_(source), head_(source), end_(end), valid_(true) {}
+
+        Ch Take() {
+            if (*src_ != '%' || src_ + 3 > end_) { // %XY triplet
+                valid_ = false;
+                return 0;
+            }
+            src_++;
+            Ch c = 0;
+            for (int j = 0; j < 2; j++) {
+                c = static_cast<Ch>(c << 4);
+                Ch h = *src_;
+                if      (h >= '0' && h <= '9') c = static_cast<Ch>(c + h - '0');
+                else if (h >= 'A' && h <= 'F') c = static_cast<Ch>(c + h - 'A' + 10);
+                else if (h >= 'a' && h <= 'f') c = static_cast<Ch>(c + h - 'a' + 10);
+                else {
+                    valid_ = false;
+                    return 0;
+                }
+                src_++;
+            }
+            return c;
+        }
+
+        size_t Tell() const { return static_cast<size_t>(src_ - head_); }
+        bool IsValid() const { return valid_; }
+
+    private:
+        const Ch* src_;     //!< Current read position.
+        const Ch* head_;    //!< Original head of the string.
+        const Ch* end_;     //!< Past-the-end position.
+        bool valid_;        //!< Whether the parsing is valid.
+    };
+
+    //! A helper stream to encode character (UTF-8 code unit) into percent-encoded sequence.
+    template <typename OutputStream>
+    class PercentEncodeStream {
+    public:
+        PercentEncodeStream(OutputStream& os) : os_(os) {}
+        void Put(char c) { // UTF-8 must be byte
+            unsigned char u = static_cast<unsigned char>(c);
+            static const char hexDigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+            os_.Put('%');
+            os_.Put(static_cast<typename OutputStream::Ch>(hexDigits[u >> 4]));
+            os_.Put(static_cast<typename OutputStream::Ch>(hexDigits[u & 15]));
+        }
+    private:
+        OutputStream& os_;
+    };
+
+    Allocator* allocator_;                  //!< The current allocator. It is either user-supplied or equal to ownAllocator_.
+    Allocator* ownAllocator_;               //!< Allocator owned by this Pointer.
+    Ch* nameBuffer_;                        //!< A buffer containing all names in tokens.
+    Token* tokens_;                         //!< A list of tokens.
+    size_t tokenCount_;                     //!< Number of tokens in tokens_.
+    size_t parseErrorOffset_;               //!< Offset in code unit when parsing fail.
+    PointerParseErrorCode parseErrorCode_;  //!< Parsing error code.
+};
+
+//! GenericPointer for Value (UTF-8, default allocator).
+typedef GenericPointer<Value> Pointer;
+
+//!@name Helper functions for GenericPointer
+//@{
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+typename T::ValueType& CreateValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, typename T::AllocatorType& a) {
+    return pointer.Create(root, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& CreateValueByPointer(T& root, const CharType(&source)[N], typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Create(root, a);
+}
+
+// No allocator parameter
+
+template <typename DocumentType>
+typename DocumentType::ValueType& CreateValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer) {
+    return pointer.Create(document);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& CreateValueByPointer(DocumentType& document, const CharType(&source)[N]) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Create(document);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+typename T::ValueType* GetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, size_t* unresolvedTokenIndex = 0) {
+    return pointer.Get(root, unresolvedTokenIndex);
+}
+
+template <typename T>
+const typename T::ValueType* GetValueByPointer(const T& root, const GenericPointer<typename T::ValueType>& pointer, size_t* unresolvedTokenIndex = 0) {
+    return pointer.Get(root, unresolvedTokenIndex);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType* GetValueByPointer(T& root, const CharType (&source)[N], size_t* unresolvedTokenIndex = 0) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Get(root, unresolvedTokenIndex);
+}
+
+template <typename T, typename CharType, size_t N>
+const typename T::ValueType* GetValueByPointer(const T& root, const CharType(&source)[N], size_t* unresolvedTokenIndex = 0) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Get(root, unresolvedTokenIndex);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer<typename T::ValueType>& pointer, const typename T::ValueType& defaultValue, typename T::AllocatorType& a) {
+    return pointer.GetWithDefault(root, defaultValue, a);
+}
+
+template <typename T>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer<typename T::ValueType>& pointer, const typename T::Ch* defaultValue, typename T::AllocatorType& a) {
+    return pointer.GetWithDefault(root, defaultValue, a);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename T>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer<typename T::ValueType>& pointer, const std::basic_string<typename T::Ch>& defaultValue, typename T::AllocatorType& a) {
+    return pointer.GetWithDefault(root, defaultValue, a);
+}
+#endif
+
+template <typename T, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename T::ValueType&))
+GetValueByPointerWithDefault(T& root, const GenericPointer<typename T::ValueType>& pointer, T2 defaultValue, typename T::AllocatorType& a) {
+    return pointer.GetWithDefault(root, defaultValue, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const typename T::ValueType& defaultValue, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).GetWithDefault(root, defaultValue, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const typename T::Ch* defaultValue, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).GetWithDefault(root, defaultValue, a);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const std::basic_string<typename T::Ch>& defaultValue, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).GetWithDefault(root, defaultValue, a);
+}
+#endif
+
+template <typename T, typename CharType, size_t N, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename T::ValueType&))
+GetValueByPointerWithDefault(T& root, const CharType(&source)[N], T2 defaultValue, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).GetWithDefault(root, defaultValue, a);
+}
+
+// No allocator parameter
+
+template <typename DocumentType>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const typename DocumentType::ValueType& defaultValue) {
+    return pointer.GetWithDefault(document, defaultValue);
+}
+
+template <typename DocumentType>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const typename DocumentType::Ch* defaultValue) {
+    return pointer.GetWithDefault(document, defaultValue);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename DocumentType>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const std::basic_string<typename DocumentType::Ch>& defaultValue) {
+    return pointer.GetWithDefault(document, defaultValue);
+}
+#endif
+
+template <typename DocumentType, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename DocumentType::ValueType&))
+GetValueByPointerWithDefault(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, T2 defaultValue) {
+    return pointer.GetWithDefault(document, defaultValue);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const typename DocumentType::ValueType& defaultValue) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).GetWithDefault(document, defaultValue);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const typename DocumentType::Ch* defaultValue) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).GetWithDefault(document, defaultValue);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const std::basic_string<typename DocumentType::Ch>& defaultValue) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).GetWithDefault(document, defaultValue);
+}
+#endif
+
+template <typename DocumentType, typename CharType, size_t N, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename DocumentType::ValueType&))
+GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], T2 defaultValue) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).GetWithDefault(document, defaultValue);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+typename T::ValueType& SetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, typename T::ValueType& value, typename T::AllocatorType& a) {
+    return pointer.Set(root, value, a);
+}
+
+template <typename T>
+typename T::ValueType& SetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, const typename T::ValueType& value, typename T::AllocatorType& a) {
+    return pointer.Set(root, value, a);
+}
+
+template <typename T>
+typename T::ValueType& SetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, const typename T::Ch* value, typename T::AllocatorType& a) {
+    return pointer.Set(root, value, a);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename T>
+typename T::ValueType& SetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, const std::basic_string<typename T::Ch>& value, typename T::AllocatorType& a) {
+    return pointer.Set(root, value, a);
+}
+#endif
+
+template <typename T, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename T::ValueType&))
+SetValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, T2 value, typename T::AllocatorType& a) {
+    return pointer.Set(root, value, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], typename T::ValueType& value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Set(root, value, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const typename T::ValueType& value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Set(root, value, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const typename T::Ch* value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Set(root, value, a);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const std::basic_string<typename T::Ch>& value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Set(root, value, a);
+}
+#endif
+
+template <typename T, typename CharType, size_t N, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename T::ValueType&))
+SetValueByPointer(T& root, const CharType(&source)[N], T2 value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Set(root, value, a);
+}
+
+// No allocator parameter
+
+template <typename DocumentType>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, typename DocumentType::ValueType& value) {
+    return pointer.Set(document, value);
+}
+
+template <typename DocumentType>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const typename DocumentType::ValueType& value) {
+    return pointer.Set(document, value);
+}
+
+template <typename DocumentType>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const typename DocumentType::Ch* value) {
+    return pointer.Set(document, value);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename DocumentType>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, const std::basic_string<typename DocumentType::Ch>& value) {
+    return pointer.Set(document, value);
+}
+#endif
+
+template <typename DocumentType, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename DocumentType::ValueType&))
+SetValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, T2 value) {
+    return pointer.Set(document, value);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], typename DocumentType::ValueType& value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Set(document, value);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const typename DocumentType::ValueType& value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Set(document, value);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const typename DocumentType::Ch* value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Set(document, value);
+}
+
+#if RAPIDJSON_HAS_STDSTRING
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const std::basic_string<typename DocumentType::Ch>& value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Set(document, value);
+}
+#endif
+
+template <typename DocumentType, typename CharType, size_t N, typename T2>
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T2>, internal::IsGenericValue<T2> >), (typename DocumentType::ValueType&))
+SetValueByPointer(DocumentType& document, const CharType(&source)[N], T2 value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Set(document, value);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+typename T::ValueType& SwapValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer, typename T::ValueType& value, typename T::AllocatorType& a) {
+    return pointer.Swap(root, value, a);
+}
+
+template <typename T, typename CharType, size_t N>
+typename T::ValueType& SwapValueByPointer(T& root, const CharType(&source)[N], typename T::ValueType& value, typename T::AllocatorType& a) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Swap(root, value, a);
+}
+
+template <typename DocumentType>
+typename DocumentType::ValueType& SwapValueByPointer(DocumentType& document, const GenericPointer<typename DocumentType::ValueType>& pointer, typename DocumentType::ValueType& value) {
+    return pointer.Swap(document, value);
+}
+
+template <typename DocumentType, typename CharType, size_t N>
+typename DocumentType::ValueType& SwapValueByPointer(DocumentType& document, const CharType(&source)[N], typename DocumentType::ValueType& value) {
+    return GenericPointer<typename DocumentType::ValueType>(source, N - 1).Swap(document, value);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+bool EraseValueByPointer(T& root, const GenericPointer<typename T::ValueType>& pointer) {
+    return pointer.Erase(root);
+}
+
+template <typename T, typename CharType, size_t N>
+bool EraseValueByPointer(T& root, const CharType(&source)[N]) {
+    return GenericPointer<typename T::ValueType>(source, N - 1).Erase(root);
+}
+
+//@}
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__) || defined(_MSC_VER)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_POINTER_H_
diff --git a/include/rapidjson/prettywriter.h b/include/rapidjson/prettywriter.h
new file mode 100644
index 0000000000..fe45df1d10
--- /dev/null
+++ b/include/rapidjson/prettywriter.h
@@ -0,0 +1,277 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_PRETTYWRITER_H_
+#define RAPIDJSON_PRETTYWRITER_H_
+
+#include "writer.h"
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Combination of PrettyWriter format flags.
+/*! \see PrettyWriter::SetFormatOptions
+ */
+enum PrettyFormatOptions {
+    kFormatDefault = 0,         //!< Default pretty formatting.
+    kFormatSingleLineArray = 1  //!< Format arrays on a single line.
+};
+
+//! Writer with indentation and spacing.
+/*!
+    \tparam OutputStream Type of output os.
+    \tparam SourceEncoding Encoding of source string.
+    \tparam TargetEncoding Encoding of output stream.
+    \tparam StackAllocator Type of allocator for allocating memory of stack.
+*/
+template<typename OutputStream, typename SourceEncoding = UTF8<>, typename TargetEncoding = UTF8<>, typename StackAllocator = CrtAllocator, unsigned writeFlags = kWriteDefaultFlags>
+class PrettyWriter : public Writer<OutputStream, SourceEncoding, TargetEncoding, StackAllocator, writeFlags> {
+public:
+    typedef Writer<OutputStream, SourceEncoding, TargetEncoding, StackAllocator, writeFlags> Base;
+    typedef typename Base::Ch Ch;
+
+    //! Constructor
+    /*! \param os Output stream.
+        \param allocator User supplied allocator. If it is null, it will create a private one.
+        \param levelDepth Initial capacity of stack.
+    */
+    explicit PrettyWriter(OutputStream& os, StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) : 
+        Base(os, allocator, levelDepth), indentChar_(' '), indentCharCount_(4), formatOptions_(kFormatDefault) {}
+
+
+    explicit PrettyWriter(StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) : 
+        Base(allocator, levelDepth), indentChar_(' '), indentCharCount_(4), formatOptions_(kFormatDefault) {}
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    PrettyWriter(PrettyWriter&& rhs) :
+        Base(std::forward<PrettyWriter>(rhs)), indentChar_(rhs.indentChar_), indentCharCount_(rhs.indentCharCount_), formatOptions_(rhs.formatOptions_) {}
+#endif
+
+    //! Set custom indentation.
+    /*! \param indentChar       Character for indentation. Must be whitespace character (' ', '\\t', '\\n', '\\r').
+        \param indentCharCount  Number of indent characters for each indentation level.
+        \note The default indentation is 4 spaces.
+    */
+    PrettyWriter& SetIndent(Ch indentChar, unsigned indentCharCount) {
+        RAPIDJSON_ASSERT(indentChar == ' ' || indentChar == '\t' || indentChar == '\n' || indentChar == '\r');
+        indentChar_ = indentChar;
+        indentCharCount_ = indentCharCount;
+        return *this;
+    }
+
+    //! Set pretty writer formatting options.
+    /*! \param options Formatting options.
+    */
+    PrettyWriter& SetFormatOptions(PrettyFormatOptions options) {
+        formatOptions_ = options;
+        return *this;
+    }
+
+    /*! @name Implementation of Handler
+        \see Handler
+    */
+    //@{
+
+    bool Null()                 { PrettyPrefix(kNullType);   return Base::EndValue(Base::WriteNull()); }
+    bool Bool(bool b)           { PrettyPrefix(b ? kTrueType : kFalseType); return Base::EndValue(Base::WriteBool(b)); }
+    bool Int(int i)             { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteInt(i)); }
+    bool Uint(unsigned u)       { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteUint(u)); }
+    bool Int64(int64_t i64)     { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteInt64(i64)); }
+    bool Uint64(uint64_t u64)   { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteUint64(u64));  }
+    bool Double(double d)       { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteDouble(d)); }
+
+    bool RawNumber(const Ch* str, SizeType length, bool copy = false) {
+        RAPIDJSON_ASSERT(str != 0);
+        (void)copy;
+        PrettyPrefix(kNumberType);
+        return Base::EndValue(Base::WriteString(str, length));
+    }
+
+    bool String(const Ch* str, SizeType length, bool copy = false) {
+        RAPIDJSON_ASSERT(str != 0);
+        (void)copy;
+        PrettyPrefix(kStringType);
+        return Base::EndValue(Base::WriteString(str, length));
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool String(const std::basic_string<Ch>& str) {
+        return String(str.data(), SizeType(str.size()));
+    }
+#endif
+
+    bool StartObject() {
+        PrettyPrefix(kObjectType);
+        new (Base::level_stack_.template Push<typename Base::Level>()) typename Base::Level(false);
+        return Base::WriteStartObject();
+    }
+
+    bool Key(const Ch* str, SizeType length, bool copy = false) { return String(str, length, copy); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool Key(const std::basic_string<Ch>& str) {
+        return Key(str.data(), SizeType(str.size()));
+    }
+#endif
+	
+    bool EndObject(SizeType memberCount = 0) {
+        (void)memberCount;
+        RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level)); // not inside an Object
+        RAPIDJSON_ASSERT(!Base::level_stack_.template Top<typename Base::Level>()->inArray); // currently inside an Array, not Object
+        RAPIDJSON_ASSERT(0 == Base::level_stack_.template Top<typename Base::Level>()->valueCount % 2); // Object has a Key without a Value
+       
+        bool empty = Base::level_stack_.template Pop<typename Base::Level>(1)->valueCount == 0;
+
+        if (!empty) {
+            Base::os_->Put('\n');
+            WriteIndent();
+        }
+        bool ret = Base::EndValue(Base::WriteEndObject());
+        (void)ret;
+        RAPIDJSON_ASSERT(ret == true);
+        if (Base::level_stack_.Empty()) // end of json text
+            Base::Flush();
+        return true;
+    }
+
+    bool StartArray() {
+        PrettyPrefix(kArrayType);
+        new (Base::level_stack_.template Push<typename Base::Level>()) typename Base::Level(true);
+        return Base::WriteStartArray();
+    }
+
+    bool EndArray(SizeType memberCount = 0) {
+        (void)memberCount;
+        RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level));
+        RAPIDJSON_ASSERT(Base::level_stack_.template Top<typename Base::Level>()->inArray);
+        bool empty = Base::level_stack_.template Pop<typename Base::Level>(1)->valueCount == 0;
+
+        if (!empty && !(formatOptions_ & kFormatSingleLineArray)) {
+            Base::os_->Put('\n');
+            WriteIndent();
+        }
+        bool ret = Base::EndValue(Base::WriteEndArray());
+        (void)ret;
+        RAPIDJSON_ASSERT(ret == true);
+        if (Base::level_stack_.Empty()) // end of json text
+            Base::Flush();
+        return true;
+    }
+
+    //@}
+
+    /*! @name Convenience extensions */
+    //@{
+
+    //! Simpler but slower overload.
+    bool String(const Ch* str) { return String(str, internal::StrLen(str)); }
+    bool Key(const Ch* str) { return Key(str, internal::StrLen(str)); }
+
+    //@}
+
+    //! Write a raw JSON value.
+    /*!
+        For user to write a stringified JSON as a value.
+
+        \param json A well-formed JSON value. It should not contain null character within [0, length - 1] range.
+        \param length Length of the json.
+        \param type Type of the root of json.
+        \note When using PrettyWriter::RawValue(), the result json may not be indented correctly.
+    */
+    bool RawValue(const Ch* json, size_t length, Type type) {
+        RAPIDJSON_ASSERT(json != 0);
+        PrettyPrefix(type);
+        return Base::EndValue(Base::WriteRawValue(json, length));
+    }
+
+protected:
+    void PrettyPrefix(Type type) {
+        (void)type;
+        if (Base::level_stack_.GetSize() != 0) { // this value is not at root
+            typename Base::Level* level = Base::level_stack_.template Top<typename Base::Level>();
+
+            if (level->inArray) {
+                if (level->valueCount > 0) {
+                    Base::os_->Put(','); // add comma if it is not the first element in array
+                    if (formatOptions_ & kFormatSingleLineArray)
+                        Base::os_->Put(' ');
+                }
+
+                if (!(formatOptions_ & kFormatSingleLineArray)) {
+                    Base::os_->Put('\n');
+                    WriteIndent();
+                }
+            }
+            else {  // in object
+                if (level->valueCount > 0) {
+                    if (level->valueCount % 2 == 0) {
+                        Base::os_->Put(',');
+                        Base::os_->Put('\n');
+                    }
+                    else {
+                        Base::os_->Put(':');
+                        Base::os_->Put(' ');
+                    }
+                }
+                else
+                    Base::os_->Put('\n');
+
+                if (level->valueCount % 2 == 0)
+                    WriteIndent();
+            }
+            if (!level->inArray && level->valueCount % 2 == 0)
+                RAPIDJSON_ASSERT(type == kStringType);  // if it's in object, then even number should be a name
+            level->valueCount++;
+        }
+        else {
+            RAPIDJSON_ASSERT(!Base::hasRoot_);  // Should only has one and only one root.
+            Base::hasRoot_ = true;
+        }
+    }
+
+    void WriteIndent()  {
+        size_t count = (Base::level_stack_.GetSize() / sizeof(typename Base::Level)) * indentCharCount_;
+        PutN(*Base::os_, static_cast<typename OutputStream::Ch>(indentChar_), count);
+    }
+
+    Ch indentChar_;
+    unsigned indentCharCount_;
+    PrettyFormatOptions formatOptions_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    PrettyWriter(const PrettyWriter&);
+    PrettyWriter& operator=(const PrettyWriter&);
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_RAPIDJSON_H_
diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h
new file mode 100644
index 0000000000..247b8e68db
--- /dev/null
+++ b/include/rapidjson/rapidjson.h
@@ -0,0 +1,741 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_RAPIDJSON_H_
+#define RAPIDJSON_RAPIDJSON_H_
+
+/*!\file rapidjson.h
+    \brief common definitions and configuration
+
+    \see RAPIDJSON_CONFIG
+ */
+
+/*! \defgroup RAPIDJSON_CONFIG RapidJSON configuration
+    \brief Configuration macros for library features
+
+    Some RapidJSON features are configurable to adapt the library to a wide
+    variety of platforms, environments and usage scenarios.  Most of the
+    features can be configured in terms of overridden or predefined
+    preprocessor macros at compile-time.
+
+    Some additional customization is available in the \ref RAPIDJSON_ERRORS APIs.
+
+    \note These macros should be given on the compiler command-line
+          (where applicable)  to avoid inconsistent values when compiling
+          different translation units of a single application.
+ */
+
+#include <cstdlib>  // malloc(), realloc(), free(), size_t
+#include <cstring>  // memset(), memcpy(), memmove(), memcmp()
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_VERSION_STRING
+//
+// ALWAYS synchronize the following 3 macros with corresponding variables in /CMakeLists.txt.
+//
+
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+// token stringification
+#define RAPIDJSON_STRINGIFY(x) RAPIDJSON_DO_STRINGIFY(x)
+#define RAPIDJSON_DO_STRINGIFY(x) #x
+
+// token concatenation
+#define RAPIDJSON_JOIN(X, Y) RAPIDJSON_DO_JOIN(X, Y)
+#define RAPIDJSON_DO_JOIN(X, Y) RAPIDJSON_DO_JOIN2(X, Y)
+#define RAPIDJSON_DO_JOIN2(X, Y) X##Y
+//!@endcond
+
+/*! \def RAPIDJSON_MAJOR_VERSION
+    \ingroup RAPIDJSON_CONFIG
+    \brief Major version of RapidJSON in integer.
+*/
+/*! \def RAPIDJSON_MINOR_VERSION
+    \ingroup RAPIDJSON_CONFIG
+    \brief Minor version of RapidJSON in integer.
+*/
+/*! \def RAPIDJSON_PATCH_VERSION
+    \ingroup RAPIDJSON_CONFIG
+    \brief Patch version of RapidJSON in integer.
+*/
+/*! \def RAPIDJSON_VERSION_STRING
+    \ingroup RAPIDJSON_CONFIG
+    \brief Version of RapidJSON in "<major>.<minor>.<patch>" string format.
+*/
+#define RAPIDJSON_MAJOR_VERSION 1
+#define RAPIDJSON_MINOR_VERSION 1
+#define RAPIDJSON_PATCH_VERSION 0
+#define RAPIDJSON_VERSION_STRING \
+    RAPIDJSON_STRINGIFY(RAPIDJSON_MAJOR_VERSION.RAPIDJSON_MINOR_VERSION.RAPIDJSON_PATCH_VERSION)
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_NAMESPACE_(BEGIN|END)
+/*! \def RAPIDJSON_NAMESPACE
+    \ingroup RAPIDJSON_CONFIG
+    \brief   provide custom rapidjson namespace
+
+    In order to avoid symbol clashes and/or "One Definition Rule" errors
+    between multiple inclusions of (different versions of) RapidJSON in
+    a single binary, users can customize the name of the main RapidJSON
+    namespace.
+
+    In case of a single nesting level, defining \c RAPIDJSON_NAMESPACE
+    to a custom name (e.g. \c MyRapidJSON) is sufficient.  If multiple
+    levels are needed, both \ref RAPIDJSON_NAMESPACE_BEGIN and \ref
+    RAPIDJSON_NAMESPACE_END need to be defined as well:
+
+    \code
+    // in some .cpp file
+    #define RAPIDJSON_NAMESPACE my::rapidjson
+    #define RAPIDJSON_NAMESPACE_BEGIN namespace my { namespace rapidjson {
+    #define RAPIDJSON_NAMESPACE_END   } }
+    #include "rapidjson/..."
+    \endcode
+
+    \see rapidjson
+ */
+/*! \def RAPIDJSON_NAMESPACE_BEGIN
+    \ingroup RAPIDJSON_CONFIG
+    \brief   provide custom rapidjson namespace (opening expression)
+    \see RAPIDJSON_NAMESPACE
+*/
+/*! \def RAPIDJSON_NAMESPACE_END
+    \ingroup RAPIDJSON_CONFIG
+    \brief   provide custom rapidjson namespace (closing expression)
+    \see RAPIDJSON_NAMESPACE
+*/
+#ifndef RAPIDJSON_NAMESPACE
+#define RAPIDJSON_NAMESPACE rapidjson
+#endif
+#ifndef RAPIDJSON_NAMESPACE_BEGIN
+#define RAPIDJSON_NAMESPACE_BEGIN namespace RAPIDJSON_NAMESPACE {
+#endif
+#ifndef RAPIDJSON_NAMESPACE_END
+#define RAPIDJSON_NAMESPACE_END }
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// __cplusplus macro
+
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+
+#if defined(_MSC_VER)
+#define RAPIDJSON_CPLUSPLUS _MSVC_LANG
+#else
+#define RAPIDJSON_CPLUSPLUS __cplusplus
+#endif
+
+//!@endcond
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_HAS_STDSTRING
+
+#ifndef RAPIDJSON_HAS_STDSTRING
+#ifdef RAPIDJSON_DOXYGEN_RUNNING
+#define RAPIDJSON_HAS_STDSTRING 1 // force generation of documentation
+#else
+#define RAPIDJSON_HAS_STDSTRING 0 // no std::string support by default
+#endif
+/*! \def RAPIDJSON_HAS_STDSTRING
+    \ingroup RAPIDJSON_CONFIG
+    \brief Enable RapidJSON support for \c std::string
+
+    By defining this preprocessor symbol to \c 1, several convenience functions for using
+    \ref rapidjson::GenericValue with \c std::string are enabled, especially
+    for construction and comparison.
+
+    \hideinitializer
+*/
+#endif // !defined(RAPIDJSON_HAS_STDSTRING)
+
+#if RAPIDJSON_HAS_STDSTRING
+#include <string>
+#endif // RAPIDJSON_HAS_STDSTRING
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_USE_MEMBERSMAP
+
+/*! \def RAPIDJSON_USE_MEMBERSMAP
+    \ingroup RAPIDJSON_CONFIG
+    \brief Enable RapidJSON support for object members handling in a \c std::multimap
+
+    By defining this preprocessor symbol to \c 1, \ref rapidjson::GenericValue object
+    members are stored in a \c std::multimap for faster lookup and deletion times, a
+    trade off with a slightly slower insertion time and a small object allocat(or)ed
+    memory overhead.
+
+    \hideinitializer
+*/
+#ifndef RAPIDJSON_USE_MEMBERSMAP
+#define RAPIDJSON_USE_MEMBERSMAP 0 // not by default
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_NO_INT64DEFINE
+
+/*! \def RAPIDJSON_NO_INT64DEFINE
+    \ingroup RAPIDJSON_CONFIG
+    \brief Use external 64-bit integer types.
+
+    RapidJSON requires the 64-bit integer types \c int64_t and  \c uint64_t types
+    to be available at global scope.
+
+    If users have their own definition, define RAPIDJSON_NO_INT64DEFINE to
+    prevent RapidJSON from defining its own types.
+*/
+#ifndef RAPIDJSON_NO_INT64DEFINE
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#if defined(_MSC_VER) && (_MSC_VER < 1800) // Visual Studio 2013
+#include "msinttypes/stdint.h"
+#include "msinttypes/inttypes.h"
+#else
+// Other compilers should have this.
+#include <stdint.h>
+#include <inttypes.h>
+#endif
+//!@endcond
+#ifdef RAPIDJSON_DOXYGEN_RUNNING
+#define RAPIDJSON_NO_INT64DEFINE
+#endif
+#endif // RAPIDJSON_NO_INT64TYPEDEF
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_FORCEINLINE
+
+#ifndef RAPIDJSON_FORCEINLINE
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#if defined(_MSC_VER) && defined(NDEBUG)
+#define RAPIDJSON_FORCEINLINE __forceinline
+#elif defined(__GNUC__) && __GNUC__ >= 4 && defined(NDEBUG)
+#define RAPIDJSON_FORCEINLINE __attribute__((always_inline))
+#else
+#define RAPIDJSON_FORCEINLINE
+#endif
+//!@endcond
+#endif // RAPIDJSON_FORCEINLINE
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ENDIAN
+#define RAPIDJSON_LITTLEENDIAN  0   //!< Little endian machine
+#define RAPIDJSON_BIGENDIAN     1   //!< Big endian machine
+
+//! Endianness of the machine.
+/*!
+    \def RAPIDJSON_ENDIAN
+    \ingroup RAPIDJSON_CONFIG
+
+    GCC 4.6 provided macro for detecting endianness of the target machine. But other
+    compilers may not have this. User can define RAPIDJSON_ENDIAN to either
+    \ref RAPIDJSON_LITTLEENDIAN or \ref RAPIDJSON_BIGENDIAN.
+
+    Default detection implemented with reference to
+    \li https://gcc.gnu.org/onlinedocs/gcc-4.6.0/cpp/Common-Predefined-Macros.html
+    \li http://www.boost.org/doc/libs/1_42_0/boost/detail/endian.hpp
+*/
+#ifndef RAPIDJSON_ENDIAN
+// Detect with GCC 4.6's macro
+#  ifdef __BYTE_ORDER__
+#    if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#      define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+#    elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#      define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+#    else
+#      error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN.
+#    endif // __BYTE_ORDER__
+// Detect with GLIBC's endian.h
+#  elif defined(__GLIBC__)
+#    include <endian.h>
+#    if (__BYTE_ORDER == __LITTLE_ENDIAN)
+#      define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+#    elif (__BYTE_ORDER == __BIG_ENDIAN)
+#      define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+#    else
+#      error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN.
+#   endif // __GLIBC__
+// Detect with _LITTLE_ENDIAN and _BIG_ENDIAN macro
+#  elif defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)
+#    define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+#  elif defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)
+#    define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+// Detect with architecture macros
+#  elif defined(__sparc) || defined(__sparc__) || defined(_POWER) || defined(__powerpc__) || defined(__ppc__) || defined(__ppc64__) || defined(__hpux) || defined(__hppa) || defined(_MIPSEB) || defined(_POWER) || defined(__s390__)
+#    define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+#  elif defined(__i386__) || defined(__alpha__) || defined(__ia64) || defined(__ia64__) || defined(_M_IX86) || defined(_M_IA64) || defined(_M_ALPHA) || defined(__amd64) || defined(__amd64__) || defined(_M_AMD64) || defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || defined(__bfin__)
+#    define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+#  elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#    define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+#  elif defined(RAPIDJSON_DOXYGEN_RUNNING)
+#    define RAPIDJSON_ENDIAN
+#  else
+#    error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN.
+#  endif
+#endif // RAPIDJSON_ENDIAN
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_64BIT
+
+//! Whether using 64-bit architecture
+#ifndef RAPIDJSON_64BIT
+#if defined(__LP64__) || (defined(__x86_64__) && defined(__ILP32__)) || defined(_WIN64) || defined(__EMSCRIPTEN__)
+#define RAPIDJSON_64BIT 1
+#else
+#define RAPIDJSON_64BIT 0
+#endif
+#endif // RAPIDJSON_64BIT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ALIGN
+
+//! Data alignment of the machine.
+/*! \ingroup RAPIDJSON_CONFIG
+    \param x pointer to align
+
+    Some machines require strict data alignment. The default is 8 bytes.
+    User can customize by defining the RAPIDJSON_ALIGN function macro.
+*/
+#ifndef RAPIDJSON_ALIGN
+#define RAPIDJSON_ALIGN(x) (((x) + static_cast<size_t>(7u)) & ~static_cast<size_t>(7u))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_UINT64_C2
+
+//! Construct a 64-bit literal by a pair of 32-bit integer.
+/*!
+    64-bit literal with or without ULL suffix is prone to compiler warnings.
+    UINT64_C() is C macro which cause compilation problems.
+    Use this macro to define 64-bit constants by a pair of 32-bit integer.
+*/
+#ifndef RAPIDJSON_UINT64_C2
+#define RAPIDJSON_UINT64_C2(high32, low32) ((static_cast<uint64_t>(high32) << 32) | static_cast<uint64_t>(low32))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_48BITPOINTER_OPTIMIZATION
+
+//! Use only lower 48-bit address for some pointers.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+
+    This optimization uses the fact that current X86-64 architecture only implement lower 48-bit virtual address.
+    The higher 16-bit can be used for storing other data.
+    \c GenericValue uses this optimization to reduce its size form 24 bytes to 16 bytes in 64-bit architecture.
+*/
+#ifndef RAPIDJSON_48BITPOINTER_OPTIMIZATION
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define RAPIDJSON_48BITPOINTER_OPTIMIZATION 1
+#else
+#define RAPIDJSON_48BITPOINTER_OPTIMIZATION 0
+#endif
+#endif // RAPIDJSON_48BITPOINTER_OPTIMIZATION
+
+#if RAPIDJSON_48BITPOINTER_OPTIMIZATION == 1
+#if RAPIDJSON_64BIT != 1
+#error RAPIDJSON_48BITPOINTER_OPTIMIZATION can only be set to 1 when RAPIDJSON_64BIT=1
+#endif
+#define RAPIDJSON_SETPOINTER(type, p, x) (p = reinterpret_cast<type *>((reinterpret_cast<uintptr_t>(p) & static_cast<uintptr_t>(RAPIDJSON_UINT64_C2(0xFFFF0000, 0x00000000))) | reinterpret_cast<uintptr_t>(reinterpret_cast<const void*>(x))))
+#define RAPIDJSON_GETPOINTER(type, p) (reinterpret_cast<type *>(reinterpret_cast<uintptr_t>(p) & static_cast<uintptr_t>(RAPIDJSON_UINT64_C2(0x0000FFFF, 0xFFFFFFFF))))
+#else
+#define RAPIDJSON_SETPOINTER(type, p, x) (p = (x))
+#define RAPIDJSON_GETPOINTER(type, p) (p)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_NEON/RAPIDJSON_SIMD
+
+/*! \def RAPIDJSON_SIMD
+    \ingroup RAPIDJSON_CONFIG
+    \brief Enable SSE2/SSE4.2/Neon optimization.
+
+    RapidJSON supports optimized implementations for some parsing operations
+    based on the SSE2, SSE4.2 or NEon SIMD extensions on modern Intel
+    or ARM compatible processors.
+
+    To enable these optimizations, three different symbols can be defined;
+    \code
+    // Enable SSE2 optimization.
+    #define RAPIDJSON_SSE2
+
+    // Enable SSE4.2 optimization.
+    #define RAPIDJSON_SSE42
+    \endcode
+
+    // Enable ARM Neon optimization.
+    #define RAPIDJSON_NEON
+    \endcode
+
+    \c RAPIDJSON_SSE42 takes precedence over SSE2, if both are defined.
+
+    If any of these symbols is defined, RapidJSON defines the macro
+    \c RAPIDJSON_SIMD to indicate the availability of the optimized code.
+*/
+#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) \
+    || defined(RAPIDJSON_NEON) || defined(RAPIDJSON_DOXYGEN_RUNNING)
+#define RAPIDJSON_SIMD
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_NO_SIZETYPEDEFINE
+
+#ifndef RAPIDJSON_NO_SIZETYPEDEFINE
+/*! \def RAPIDJSON_NO_SIZETYPEDEFINE
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-provided \c SizeType definition.
+
+    In order to avoid using 32-bit size types for indexing strings and arrays,
+    define this preprocessor symbol and provide the type rapidjson::SizeType
+    before including RapidJSON:
+    \code
+    #define RAPIDJSON_NO_SIZETYPEDEFINE
+    namespace rapidjson { typedef ::std::size_t SizeType; }
+    #include "rapidjson/..."
+    \endcode
+
+    \see rapidjson::SizeType
+*/
+#ifdef RAPIDJSON_DOXYGEN_RUNNING
+#define RAPIDJSON_NO_SIZETYPEDEFINE
+#endif
+RAPIDJSON_NAMESPACE_BEGIN
+//! Size type (for string lengths, array sizes, etc.)
+/*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms,
+    instead of using \c size_t. Users may override the SizeType by defining
+    \ref RAPIDJSON_NO_SIZETYPEDEFINE.
+*/
+typedef unsigned SizeType;
+RAPIDJSON_NAMESPACE_END
+#endif
+
+// always import std::size_t to rapidjson namespace
+RAPIDJSON_NAMESPACE_BEGIN
+using std::size_t;
+RAPIDJSON_NAMESPACE_END
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ASSERT
+
+//! Assertion.
+/*! \ingroup RAPIDJSON_CONFIG
+    By default, rapidjson uses C \c assert() for internal assertions.
+    User can override it by defining RAPIDJSON_ASSERT(x) macro.
+
+    \note Parsing errors are handled and can be customized by the
+          \ref RAPIDJSON_ERRORS APIs.
+*/
+#ifndef RAPIDJSON_ASSERT
+#include <cassert>
+#define RAPIDJSON_ASSERT(x) assert(x)
+#endif // RAPIDJSON_ASSERT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_STATIC_ASSERT
+
+// Prefer C++11 static_assert, if available
+#ifndef RAPIDJSON_STATIC_ASSERT
+#if RAPIDJSON_CPLUSPLUS >= 201103L || ( defined(_MSC_VER) && _MSC_VER >= 1800 )
+#define RAPIDJSON_STATIC_ASSERT(x) \
+   static_assert(x, RAPIDJSON_STRINGIFY(x))
+#endif // C++11
+#endif // RAPIDJSON_STATIC_ASSERT
+
+// Adopt C++03 implementation from boost
+#ifndef RAPIDJSON_STATIC_ASSERT
+#ifndef __clang__
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#endif
+RAPIDJSON_NAMESPACE_BEGIN
+template <bool x> struct STATIC_ASSERTION_FAILURE;
+template <> struct STATIC_ASSERTION_FAILURE<true> { enum { value = 1 }; };
+template <size_t x> struct StaticAssertTest {};
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__GNUC__) || defined(__clang__)
+#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE 
+#endif
+#ifndef __clang__
+//!@endcond
+#endif
+
+/*! \def RAPIDJSON_STATIC_ASSERT
+    \brief (Internal) macro to check for conditions at compile-time
+    \param x compile-time condition
+    \hideinitializer
+ */
+#define RAPIDJSON_STATIC_ASSERT(x) \
+    typedef ::RAPIDJSON_NAMESPACE::StaticAssertTest< \
+      sizeof(::RAPIDJSON_NAMESPACE::STATIC_ASSERTION_FAILURE<bool(x) >)> \
+    RAPIDJSON_JOIN(StaticAssertTypedef, __LINE__) RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE
+#endif // RAPIDJSON_STATIC_ASSERT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_LIKELY, RAPIDJSON_UNLIKELY
+
+//! Compiler branching hint for expression with high probability to be true.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+    \param x Boolean expression likely to be true.
+*/
+#ifndef RAPIDJSON_LIKELY
+#if defined(__GNUC__) || defined(__clang__)
+#define RAPIDJSON_LIKELY(x) __builtin_expect(!!(x), 1)
+#else
+#define RAPIDJSON_LIKELY(x) (x)
+#endif
+#endif
+
+//! Compiler branching hint for expression with low probability to be true.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+    \param x Boolean expression unlikely to be true.
+*/
+#ifndef RAPIDJSON_UNLIKELY
+#if defined(__GNUC__) || defined(__clang__)
+#define RAPIDJSON_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define RAPIDJSON_UNLIKELY(x) (x)
+#endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// Helpers
+
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+
+#define RAPIDJSON_MULTILINEMACRO_BEGIN do {
+#define RAPIDJSON_MULTILINEMACRO_END \
+} while((void)0, 0)
+
+// adopted from Boost
+#define RAPIDJSON_VERSION_CODE(x,y,z) \
+  (((x)*100000) + ((y)*100) + (z))
+
+#if defined(__has_builtin)
+#define RAPIDJSON_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define RAPIDJSON_HAS_BUILTIN(x) 0
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_DIAG_PUSH/POP, RAPIDJSON_DIAG_OFF
+
+#if defined(__GNUC__)
+#define RAPIDJSON_GNUC \
+    RAPIDJSON_VERSION_CODE(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__)
+#endif
+
+#if defined(__clang__) || (defined(RAPIDJSON_GNUC) && RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,2,0))
+
+#define RAPIDJSON_PRAGMA(x) _Pragma(RAPIDJSON_STRINGIFY(x))
+#define RAPIDJSON_DIAG_PRAGMA(x) RAPIDJSON_PRAGMA(GCC diagnostic x)
+#define RAPIDJSON_DIAG_OFF(x) \
+    RAPIDJSON_DIAG_PRAGMA(ignored RAPIDJSON_STRINGIFY(RAPIDJSON_JOIN(-W,x)))
+
+// push/pop support in Clang and GCC>=4.6
+#if defined(__clang__) || (defined(RAPIDJSON_GNUC) && RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0))
+#define RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PRAGMA(push)
+#define RAPIDJSON_DIAG_POP  RAPIDJSON_DIAG_PRAGMA(pop)
+#else // GCC >= 4.2, < 4.6
+#define RAPIDJSON_DIAG_PUSH /* ignored */
+#define RAPIDJSON_DIAG_POP /* ignored */
+#endif
+
+#elif defined(_MSC_VER)
+
+// pragma (MSVC specific)
+#define RAPIDJSON_PRAGMA(x) __pragma(x)
+#define RAPIDJSON_DIAG_PRAGMA(x) RAPIDJSON_PRAGMA(warning(x))
+
+#define RAPIDJSON_DIAG_OFF(x) RAPIDJSON_DIAG_PRAGMA(disable: x)
+#define RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PRAGMA(push)
+#define RAPIDJSON_DIAG_POP  RAPIDJSON_DIAG_PRAGMA(pop)
+
+#else
+
+#define RAPIDJSON_DIAG_OFF(x) /* ignored */
+#define RAPIDJSON_DIAG_PUSH   /* ignored */
+#define RAPIDJSON_DIAG_POP    /* ignored */
+
+#endif // RAPIDJSON_DIAG_*
+
+///////////////////////////////////////////////////////////////////////////////
+// C++11 features
+
+#ifndef RAPIDJSON_HAS_CXX11
+#define RAPIDJSON_HAS_CXX11 (RAPIDJSON_CPLUSPLUS >= 201103L)
+#endif
+
+#ifndef RAPIDJSON_HAS_CXX11_RVALUE_REFS
+#if RAPIDJSON_HAS_CXX11
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
+#elif defined(__clang__)
+#if __has_feature(cxx_rvalue_references) && \
+    (defined(_MSC_VER) || defined(_LIBCPP_VERSION) || defined(__GLIBCXX__) && __GLIBCXX__ >= 20080306)
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
+#else
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 0
+#endif
+#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,3,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \
+      (defined(_MSC_VER) && _MSC_VER >= 1600) || \
+      (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__))
+
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
+#else
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 0
+#endif
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+#include <utility> // std::move
+#endif
+
+#ifndef RAPIDJSON_HAS_CXX11_NOEXCEPT
+#if RAPIDJSON_HAS_CXX11
+#define RAPIDJSON_HAS_CXX11_NOEXCEPT 1
+#elif defined(__clang__)
+#define RAPIDJSON_HAS_CXX11_NOEXCEPT __has_feature(cxx_noexcept)
+#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \
+    (defined(_MSC_VER) && _MSC_VER >= 1900) || \
+    (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__))
+#define RAPIDJSON_HAS_CXX11_NOEXCEPT 1
+#else
+#define RAPIDJSON_HAS_CXX11_NOEXCEPT 0
+#endif
+#endif
+#ifndef RAPIDJSON_NOEXCEPT
+#if RAPIDJSON_HAS_CXX11_NOEXCEPT
+#define RAPIDJSON_NOEXCEPT noexcept
+#else
+#define RAPIDJSON_NOEXCEPT throw()
+#endif // RAPIDJSON_HAS_CXX11_NOEXCEPT
+#endif
+
+// no automatic detection, yet
+#ifndef RAPIDJSON_HAS_CXX11_TYPETRAITS
+#if (defined(_MSC_VER) && _MSC_VER >= 1700)
+#define RAPIDJSON_HAS_CXX11_TYPETRAITS 1
+#else
+#define RAPIDJSON_HAS_CXX11_TYPETRAITS 0
+#endif
+#endif
+
+#ifndef RAPIDJSON_HAS_CXX11_RANGE_FOR
+#if defined(__clang__)
+#define RAPIDJSON_HAS_CXX11_RANGE_FOR __has_feature(cxx_range_for)
+#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \
+      (defined(_MSC_VER) && _MSC_VER >= 1700) || \
+      (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__))
+#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1
+#else
+#define RAPIDJSON_HAS_CXX11_RANGE_FOR 0
+#endif
+#endif // RAPIDJSON_HAS_CXX11_RANGE_FOR
+
+///////////////////////////////////////////////////////////////////////////////
+// C++17 features
+
+#ifndef RAPIDJSON_HAS_CXX17
+#define RAPIDJSON_HAS_CXX17 (RAPIDJSON_CPLUSPLUS >= 201703L)
+#endif
+
+#if RAPIDJSON_HAS_CXX17
+# define RAPIDJSON_DELIBERATE_FALLTHROUGH [[fallthrough]]
+#elif defined(__has_cpp_attribute)
+# if __has_cpp_attribute(clang::fallthrough)
+#  define RAPIDJSON_DELIBERATE_FALLTHROUGH [[clang::fallthrough]]
+# elif __has_cpp_attribute(fallthrough)
+#  define RAPIDJSON_DELIBERATE_FALLTHROUGH __attribute__((fallthrough))
+# else
+#  define RAPIDJSON_DELIBERATE_FALLTHROUGH
+# endif
+#else
+# define RAPIDJSON_DELIBERATE_FALLTHROUGH
+#endif
+
+//!@endcond
+
+//! Assertion (in non-throwing contexts).
+ /*! \ingroup RAPIDJSON_CONFIG
+    Some functions provide a \c noexcept guarantee, if the compiler supports it.
+    In these cases, the \ref RAPIDJSON_ASSERT macro cannot be overridden to
+    throw an exception.  This macro adds a separate customization point for
+    such cases.
+
+    Defaults to C \c assert() (as \ref RAPIDJSON_ASSERT), if \c noexcept is
+    supported, and to \ref RAPIDJSON_ASSERT otherwise.
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_NOEXCEPT_ASSERT
+
+#ifndef RAPIDJSON_NOEXCEPT_ASSERT
+#ifdef RAPIDJSON_ASSERT_THROWS
+#include <cassert>
+#define RAPIDJSON_NOEXCEPT_ASSERT(x) assert(x)
+#else
+#define RAPIDJSON_NOEXCEPT_ASSERT(x) RAPIDJSON_ASSERT(x)
+#endif // RAPIDJSON_ASSERT_THROWS
+#endif // RAPIDJSON_NOEXCEPT_ASSERT
+
+///////////////////////////////////////////////////////////////////////////////
+// malloc/realloc/free
+
+#ifndef RAPIDJSON_MALLOC
+///! customization point for global \c malloc
+#define RAPIDJSON_MALLOC(size) std::malloc(size)
+#endif
+#ifndef RAPIDJSON_REALLOC
+///! customization point for global \c realloc
+#define RAPIDJSON_REALLOC(ptr, new_size) std::realloc(ptr, new_size)
+#endif
+#ifndef RAPIDJSON_FREE
+///! customization point for global \c free
+#define RAPIDJSON_FREE(ptr) std::free(ptr)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// new/delete
+
+#ifndef RAPIDJSON_NEW
+///! customization point for global \c new
+#define RAPIDJSON_NEW(TypeName) new TypeName
+#endif
+#ifndef RAPIDJSON_DELETE
+///! customization point for global \c delete
+#define RAPIDJSON_DELETE(x) delete x
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// Type
+
+/*! \namespace rapidjson
+    \brief main RapidJSON namespace
+    \see RAPIDJSON_NAMESPACE
+*/
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Type of JSON value
+enum Type {
+    kNullType = 0,      //!< null
+    kFalseType = 1,     //!< false
+    kTrueType = 2,      //!< true
+    kObjectType = 3,    //!< object
+    kArrayType = 4,     //!< array
+    kStringType = 5,    //!< string
+    kNumberType = 6     //!< number
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_RAPIDJSON_H_
diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h
new file mode 100644
index 0000000000..f7ef610244
--- /dev/null
+++ b/include/rapidjson/reader.h
@@ -0,0 +1,2246 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_READER_H_
+#define RAPIDJSON_READER_H_
+
+/*! \file reader.h */
+
+#include "allocators.h"
+#include "stream.h"
+#include "encodedstream.h"
+#include "internal/clzll.h"
+#include "internal/meta.h"
+#include "internal/stack.h"
+#include "internal/strtod.h"
+#include <limits>
+
+#if defined(RAPIDJSON_SIMD) && defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#endif
+#ifdef RAPIDJSON_SSE42
+#include <nmmintrin.h>
+#elif defined(RAPIDJSON_SSE2)
+#include <emmintrin.h>
+#elif defined(RAPIDJSON_NEON)
+#include <arm_neon.h>
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(old-style-cast)
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(switch-enum)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4127)  // conditional expression is constant
+RAPIDJSON_DIAG_OFF(4702)  // unreachable code
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#define RAPIDJSON_NOTHING /* deliberately empty */
+#ifndef RAPIDJSON_PARSE_ERROR_EARLY_RETURN
+#define RAPIDJSON_PARSE_ERROR_EARLY_RETURN(value) \
+    RAPIDJSON_MULTILINEMACRO_BEGIN \
+    if (RAPIDJSON_UNLIKELY(HasParseError())) { return value; } \
+    RAPIDJSON_MULTILINEMACRO_END
+#endif
+#define RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID \
+    RAPIDJSON_PARSE_ERROR_EARLY_RETURN(RAPIDJSON_NOTHING)
+//!@endcond
+
+/*! \def RAPIDJSON_PARSE_ERROR_NORETURN
+    \ingroup RAPIDJSON_ERRORS
+    \brief Macro to indicate a parse error.
+    \param parseErrorCode \ref rapidjson::ParseErrorCode of the error
+    \param offset  position of the error in JSON input (\c size_t)
+
+    This macros can be used as a customization point for the internal
+    error handling mechanism of RapidJSON.
+
+    A common usage model is to throw an exception instead of requiring the
+    caller to explicitly check the \ref rapidjson::GenericReader::Parse's
+    return value:
+
+    \code
+    #define RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode,offset) \
+       throw ParseException(parseErrorCode, #parseErrorCode, offset)
+
+    #include <stdexcept>               // std::runtime_error
+    #include "rapidjson/error/error.h" // rapidjson::ParseResult
+
+    struct ParseException : std::runtime_error, rapidjson::ParseResult {
+      ParseException(rapidjson::ParseErrorCode code, const char* msg, size_t offset)
+        : std::runtime_error(msg), ParseResult(code, offset) {}
+    };
+
+    #include "rapidjson/reader.h"
+    \endcode
+
+    \see RAPIDJSON_PARSE_ERROR, rapidjson::GenericReader::Parse
+ */
+#ifndef RAPIDJSON_PARSE_ERROR_NORETURN
+#define RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode, offset) \
+    RAPIDJSON_MULTILINEMACRO_BEGIN \
+    RAPIDJSON_ASSERT(!HasParseError()); /* Error can only be assigned once */ \
+    SetParseError(parseErrorCode, offset); \
+    RAPIDJSON_MULTILINEMACRO_END
+#endif
+
+/*! \def RAPIDJSON_PARSE_ERROR
+    \ingroup RAPIDJSON_ERRORS
+    \brief (Internal) macro to indicate and handle a parse error.
+    \param parseErrorCode \ref rapidjson::ParseErrorCode of the error
+    \param offset  position of the error in JSON input (\c size_t)
+
+    Invokes RAPIDJSON_PARSE_ERROR_NORETURN and stops the parsing.
+
+    \see RAPIDJSON_PARSE_ERROR_NORETURN
+    \hideinitializer
+ */
+#ifndef RAPIDJSON_PARSE_ERROR
+#define RAPIDJSON_PARSE_ERROR(parseErrorCode, offset) \
+    RAPIDJSON_MULTILINEMACRO_BEGIN \
+    RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode, offset); \
+    RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; \
+    RAPIDJSON_MULTILINEMACRO_END
+#endif
+
+#include "error/error.h" // ParseErrorCode, ParseResult
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// ParseFlag
+
+/*! \def RAPIDJSON_PARSE_DEFAULT_FLAGS
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-defined kParseDefaultFlags definition.
+
+    User can define this as any \c ParseFlag combinations.
+*/
+#ifndef RAPIDJSON_PARSE_DEFAULT_FLAGS
+#define RAPIDJSON_PARSE_DEFAULT_FLAGS kParseNoFlags
+#endif
+
+//! Combination of parseFlags
+/*! \see Reader::Parse, Document::Parse, Document::ParseInsitu, Document::ParseStream
+ */
+enum ParseFlag {
+    kParseNoFlags = 0,              //!< No flags are set.
+    kParseInsituFlag = 1,           //!< In-situ(destructive) parsing.
+    kParseValidateEncodingFlag = 2, //!< Validate encoding of JSON strings.
+    kParseIterativeFlag = 4,        //!< Iterative(constant complexity in terms of function call stack size) parsing.
+    kParseStopWhenDoneFlag = 8,     //!< After parsing a complete JSON root from stream, stop further processing the rest of stream. When this flag is used, parser will not generate kParseErrorDocumentRootNotSingular error.
+    kParseFullPrecisionFlag = 16,   //!< Parse number in full precision (but slower).
+    kParseCommentsFlag = 32,        //!< Allow one-line (//) and multi-line (/**/) comments.
+    kParseNumbersAsStringsFlag = 64,    //!< Parse all numbers (ints/doubles) as strings.
+    kParseTrailingCommasFlag = 128, //!< Allow trailing commas at the end of objects and arrays.
+    kParseNanAndInfFlag = 256,      //!< Allow parsing NaN, Inf, Infinity, -Inf and -Infinity as doubles.
+    kParseEscapedApostropheFlag = 512,  //!< Allow escaped apostrophe in strings.
+    kParseDefaultFlags = RAPIDJSON_PARSE_DEFAULT_FLAGS  //!< Default parse flags. Can be customized by defining RAPIDJSON_PARSE_DEFAULT_FLAGS
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Handler
+
+/*! \class rapidjson::Handler
+    \brief Concept for receiving events from GenericReader upon parsing.
+    The functions return true if no error occurs. If they return false,
+    the event publisher should terminate the process.
+\code
+concept Handler {
+    typename Ch;
+
+    bool Null();
+    bool Bool(bool b);
+    bool Int(int i);
+    bool Uint(unsigned i);
+    bool Int64(int64_t i);
+    bool Uint64(uint64_t i);
+    bool Double(double d);
+    /// enabled via kParseNumbersAsStringsFlag, string is not null-terminated (use length)
+    bool RawNumber(const Ch* str, SizeType length, bool copy);
+    bool String(const Ch* str, SizeType length, bool copy);
+    bool StartObject();
+    bool Key(const Ch* str, SizeType length, bool copy);
+    bool EndObject(SizeType memberCount);
+    bool StartArray();
+    bool EndArray(SizeType elementCount);
+};
+\endcode
+*/
+///////////////////////////////////////////////////////////////////////////////
+// BaseReaderHandler
+
+//! Default implementation of Handler.
+/*! This can be used as base class of any reader handler.
+    \note implements Handler concept
+*/
+template<typename Encoding = UTF8<>, typename Derived = void>
+struct BaseReaderHandler {
+    typedef typename Encoding::Ch Ch;
+
+    typedef typename internal::SelectIf<internal::IsSame<Derived, void>, BaseReaderHandler, Derived>::Type Override;
+
+    bool Default() { return true; }
+    bool Null() { return static_cast<Override&>(*this).Default(); }
+    bool Bool(bool) { return static_cast<Override&>(*this).Default(); }
+    bool Int(int) { return static_cast<Override&>(*this).Default(); }
+    bool Uint(unsigned) { return static_cast<Override&>(*this).Default(); }
+    bool Int64(int64_t) { return static_cast<Override&>(*this).Default(); }
+    bool Uint64(uint64_t) { return static_cast<Override&>(*this).Default(); }
+    bool Double(double) { return static_cast<Override&>(*this).Default(); }
+    /// enabled via kParseNumbersAsStringsFlag, string is not null-terminated (use length)
+    bool RawNumber(const Ch* str, SizeType len, bool copy) { return static_cast<Override&>(*this).String(str, len, copy); }
+    bool String(const Ch*, SizeType, bool) { return static_cast<Override&>(*this).Default(); }
+    bool StartObject() { return static_cast<Override&>(*this).Default(); }
+    bool Key(const Ch* str, SizeType len, bool copy) { return static_cast<Override&>(*this).String(str, len, copy); }
+    bool EndObject(SizeType) { return static_cast<Override&>(*this).Default(); }
+    bool StartArray() { return static_cast<Override&>(*this).Default(); }
+    bool EndArray(SizeType) { return static_cast<Override&>(*this).Default(); }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// StreamLocalCopy
+
+namespace internal {
+
+template<typename Stream, int = StreamTraits<Stream>::copyOptimization>
+class StreamLocalCopy;
+
+//! Do copy optimization.
+template<typename Stream>
+class StreamLocalCopy<Stream, 1> {
+public:
+    StreamLocalCopy(Stream& original) : s(original), original_(original) {}
+    ~StreamLocalCopy() { original_ = s; }
+
+    Stream s;
+
+private:
+    StreamLocalCopy& operator=(const StreamLocalCopy&) /* = delete */;
+
+    Stream& original_;
+};
+
+//! Keep reference.
+template<typename Stream>
+class StreamLocalCopy<Stream, 0> {
+public:
+    StreamLocalCopy(Stream& original) : s(original) {}
+
+    Stream& s;
+
+private:
+    StreamLocalCopy& operator=(const StreamLocalCopy&) /* = delete */;
+};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// SkipWhitespace
+
+//! Skip the JSON white spaces in a stream.
+/*! \param is A input stream for skipping white spaces.
+    \note This function has SSE2/SSE4.2 specialization.
+*/
+template<typename InputStream>
+void SkipWhitespace(InputStream& is) {
+    internal::StreamLocalCopy<InputStream> copy(is);
+    InputStream& s(copy.s);
+
+    typename InputStream::Ch c;
+    while ((c = s.Peek()) == ' ' || c == '\n' || c == '\r' || c == '\t')
+        s.Take();
+}
+
+inline const char* SkipWhitespace(const char* p, const char* end) {
+    while (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
+        ++p;
+    return p;
+}
+
+#ifdef RAPIDJSON_SSE42
+//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once.
+inline const char *SkipWhitespace_SIMD(const char* p) {
+    // Fast return for single non-whitespace
+    if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+        ++p;
+    else
+        return p;
+
+    // 16-byte align to the next boundary
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    while (p != nextAligned)
+        if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+            ++p;
+        else
+            return p;
+
+    // The rest of string using SIMD
+    static const char whitespace[16] = " \n\r\t";
+    const __m128i w = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespace[0]));
+
+    for (;; p += 16) {
+        const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+        const int r = _mm_cmpistri(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY);
+        if (r != 16)    // some of characters is non-whitespace
+            return p + r;
+    }
+}
+
+inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
+    // Fast return for single non-whitespace
+    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
+        ++p;
+    else
+        return p;
+
+    // The middle of string using SIMD
+    static const char whitespace[16] = " \n\r\t";
+    const __m128i w = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespace[0]));
+
+    for (; p <= end - 16; p += 16) {
+        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
+        const int r = _mm_cmpistri(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY);
+        if (r != 16)    // some of characters is non-whitespace
+            return p + r;
+    }
+
+    return SkipWhitespace(p, end);
+}
+
+#elif defined(RAPIDJSON_SSE2)
+
+//! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once.
+inline const char *SkipWhitespace_SIMD(const char* p) {
+    // Fast return for single non-whitespace
+    if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+        ++p;
+    else
+        return p;
+
+    // 16-byte align to the next boundary
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    while (p != nextAligned)
+        if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+            ++p;
+        else
+            return p;
+
+    // The rest of string
+    #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
+    static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') };
+    #undef C16
+
+    const __m128i w0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[0][0]));
+    const __m128i w1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[1][0]));
+    const __m128i w2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[2][0]));
+    const __m128i w3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[3][0]));
+
+    for (;; p += 16) {
+        const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+        __m128i x = _mm_cmpeq_epi8(s, w0);
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
+        unsigned short r = static_cast<unsigned short>(~_mm_movemask_epi8(x));
+        if (r != 0) {   // some of characters may be non-whitespace
+#ifdef _MSC_VER         // Find the index of first non-whitespace
+            unsigned long offset;
+            _BitScanForward(&offset, r);
+            return p + offset;
+#else
+            return p + __builtin_ffs(r) - 1;
+#endif
+        }
+    }
+}
+
+inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
+    // Fast return for single non-whitespace
+    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
+        ++p;
+    else
+        return p;
+
+    // The rest of string
+    #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
+    static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') };
+    #undef C16
+
+    const __m128i w0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[0][0]));
+    const __m128i w1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[1][0]));
+    const __m128i w2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[2][0]));
+    const __m128i w3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[3][0]));
+
+    for (; p <= end - 16; p += 16) {
+        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
+        __m128i x = _mm_cmpeq_epi8(s, w0);
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
+        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
+        unsigned short r = static_cast<unsigned short>(~_mm_movemask_epi8(x));
+        if (r != 0) {   // some of characters may be non-whitespace
+#ifdef _MSC_VER         // Find the index of first non-whitespace
+            unsigned long offset;
+            _BitScanForward(&offset, r);
+            return p + offset;
+#else
+            return p + __builtin_ffs(r) - 1;
+#endif
+        }
+    }
+
+    return SkipWhitespace(p, end);
+}
+
+#elif defined(RAPIDJSON_NEON)
+
+//! Skip whitespace with ARM Neon instructions, testing 16 8-byte characters at once.
+inline const char *SkipWhitespace_SIMD(const char* p) {
+    // Fast return for single non-whitespace
+    if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+        ++p;
+    else
+        return p;
+
+    // 16-byte align to the next boundary
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    while (p != nextAligned)
+        if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
+            ++p;
+        else
+            return p;
+
+    const uint8x16_t w0 = vmovq_n_u8(' ');
+    const uint8x16_t w1 = vmovq_n_u8('\n');
+    const uint8x16_t w2 = vmovq_n_u8('\r');
+    const uint8x16_t w3 = vmovq_n_u8('\t');
+
+    for (;; p += 16) {
+        const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
+        uint8x16_t x = vceqq_u8(s, w0);
+        x = vorrq_u8(x, vceqq_u8(s, w1));
+        x = vorrq_u8(x, vceqq_u8(s, w2));
+        x = vorrq_u8(x, vceqq_u8(s, w3));
+
+        x = vmvnq_u8(x);                       // Negate
+        x = vrev64q_u8(x);                     // Rev in 64
+        uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+        uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+        if (low == 0) {
+            if (high != 0) {
+                uint32_t lz = internal::clzll(high);
+                return p + 8 + (lz >> 3);
+            }
+        } else {
+            uint32_t lz = internal::clzll(low);
+            return p + (lz >> 3);
+        }
+    }
+}
+
+inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
+    // Fast return for single non-whitespace
+    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
+        ++p;
+    else
+        return p;
+
+    const uint8x16_t w0 = vmovq_n_u8(' ');
+    const uint8x16_t w1 = vmovq_n_u8('\n');
+    const uint8x16_t w2 = vmovq_n_u8('\r');
+    const uint8x16_t w3 = vmovq_n_u8('\t');
+
+    for (; p <= end - 16; p += 16) {
+        const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
+        uint8x16_t x = vceqq_u8(s, w0);
+        x = vorrq_u8(x, vceqq_u8(s, w1));
+        x = vorrq_u8(x, vceqq_u8(s, w2));
+        x = vorrq_u8(x, vceqq_u8(s, w3));
+
+        x = vmvnq_u8(x);                       // Negate
+        x = vrev64q_u8(x);                     // Rev in 64
+        uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+        uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+        if (low == 0) {
+            if (high != 0) {
+                uint32_t lz = internal::clzll(high);
+                return p + 8 + (lz >> 3);
+            }
+        } else {
+            uint32_t lz = internal::clzll(low);
+            return p + (lz >> 3);
+        }
+    }
+
+    return SkipWhitespace(p, end);
+}
+
+#endif // RAPIDJSON_NEON
+
+#ifdef RAPIDJSON_SIMD
+//! Template function specialization for InsituStringStream
+template<> inline void SkipWhitespace(InsituStringStream& is) {
+    is.src_ = const_cast<char*>(SkipWhitespace_SIMD(is.src_));
+}
+
+//! Template function specialization for StringStream
+template<> inline void SkipWhitespace(StringStream& is) {
+    is.src_ = SkipWhitespace_SIMD(is.src_);
+}
+
+template<> inline void SkipWhitespace(EncodedInputStream<UTF8<>, MemoryStream>& is) {
+    is.is_.src_ = SkipWhitespace_SIMD(is.is_.src_, is.is_.end_);
+}
+#endif // RAPIDJSON_SIMD
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericReader
+
+//! SAX-style JSON parser. Use \ref Reader for UTF8 encoding and default allocator.
+/*! GenericReader parses JSON text from a stream, and send events synchronously to an
+    object implementing Handler concept.
+
+    It needs to allocate a stack for storing a single decoded string during
+    non-destructive parsing.
+
+    For in-situ parsing, the decoded string is directly written to the source
+    text string, no temporary buffer is required.
+
+    A GenericReader object can be reused for parsing multiple JSON text.
+
+    \tparam SourceEncoding Encoding of the input stream.
+    \tparam TargetEncoding Encoding of the parse output.
+    \tparam StackAllocator Allocator type for stack.
+*/
+template <typename SourceEncoding, typename TargetEncoding, typename StackAllocator = CrtAllocator>
+class GenericReader {
+public:
+    typedef typename SourceEncoding::Ch Ch; //!< SourceEncoding character type
+
+    //! Constructor.
+    /*! \param stackAllocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing)
+        \param stackCapacity stack capacity in bytes for storing a single decoded string.  (Only use for non-destructive parsing)
+    */
+    GenericReader(StackAllocator* stackAllocator = 0, size_t stackCapacity = kDefaultStackCapacity) :
+        stack_(stackAllocator, stackCapacity), parseResult_(), state_(IterativeParsingStartState) {}
+
+    //! Parse JSON text.
+    /*! \tparam parseFlags Combination of \ref ParseFlag.
+        \tparam InputStream Type of input stream, implementing Stream concept.
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+    */
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    ParseResult Parse(InputStream& is, Handler& handler) {
+        if (parseFlags & kParseIterativeFlag)
+            return IterativeParse<parseFlags>(is, handler);
+
+        parseResult_.Clear();
+
+        ClearStackOnExit scope(*this);
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+        if (RAPIDJSON_UNLIKELY(is.Peek() == '\0')) {
+            RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorDocumentEmpty, is.Tell());
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        }
+        else {
+            ParseValue<parseFlags>(is, handler);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+            if (!(parseFlags & kParseStopWhenDoneFlag)) {
+                SkipWhitespaceAndComments<parseFlags>(is);
+                RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+                if (RAPIDJSON_UNLIKELY(is.Peek() != '\0')) {
+                    RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorDocumentRootNotSingular, is.Tell());
+                    RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+                }
+            }
+        }
+
+        return parseResult_;
+    }
+
+    //! Parse JSON text (with \ref kParseDefaultFlags)
+    /*! \tparam InputStream Type of input stream, implementing Stream concept
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+    */
+    template <typename InputStream, typename Handler>
+    ParseResult Parse(InputStream& is, Handler& handler) {
+        return Parse<kParseDefaultFlags>(is, handler);
+    }
+
+    //! Initialize JSON text token-by-token parsing
+    /*!
+     */
+    void IterativeParseInit() {
+        parseResult_.Clear();
+        state_ = IterativeParsingStartState;
+    }
+
+    //! Parse one token from JSON text
+    /*! \tparam InputStream Type of input stream, implementing Stream concept
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+     */
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    bool IterativeParseNext(InputStream& is, Handler& handler) {
+        while (RAPIDJSON_LIKELY(is.Peek() != '\0')) {
+            SkipWhitespaceAndComments<parseFlags>(is);
+
+            Token t = Tokenize(is.Peek());
+            IterativeParsingState n = Predict(state_, t);
+            IterativeParsingState d = Transit<parseFlags>(state_, t, n, is, handler);
+
+            // If we've finished or hit an error...
+            if (RAPIDJSON_UNLIKELY(IsIterativeParsingCompleteState(d))) {
+                // Report errors.
+                if (d == IterativeParsingErrorState) {
+                    HandleError(state_, is);
+                    return false;
+                }
+
+                // Transition to the finish state.
+                RAPIDJSON_ASSERT(d == IterativeParsingFinishState);
+                state_ = d;
+
+                // If StopWhenDone is not set...
+                if (!(parseFlags & kParseStopWhenDoneFlag)) {
+                    // ... and extra non-whitespace data is found...
+                    SkipWhitespaceAndComments<parseFlags>(is);
+                    if (is.Peek() != '\0') {
+                        // ... this is considered an error.
+                        HandleError(state_, is);
+                        return false;
+                    }
+                }
+
+                // Success! We are done!
+                return true;
+            }
+
+            // Transition to the new state.
+            state_ = d;
+
+            // If we parsed anything other than a delimiter, we invoked the handler, so we can return true now.
+            if (!IsIterativeParsingDelimiterState(n))
+                return true;
+        }
+
+        // We reached the end of file.
+        stack_.Clear();
+
+        if (state_ != IterativeParsingFinishState) {
+            HandleError(state_, is);
+            return false;
+        }
+
+        return true;
+    }
+
+    //! Check if token-by-token parsing JSON text is complete
+    /*! \return Whether the JSON has been fully decoded.
+     */
+    RAPIDJSON_FORCEINLINE bool IterativeParseComplete() const {
+        return IsIterativeParsingCompleteState(state_);
+    }
+
+    //! Whether a parse error has occurred in the last parsing.
+    bool HasParseError() const { return parseResult_.IsError(); }
+
+    //! Get the \ref ParseErrorCode of last parsing.
+    ParseErrorCode GetParseErrorCode() const { return parseResult_.Code(); }
+
+    //! Get the position of last parsing error in input, 0 otherwise.
+    size_t GetErrorOffset() const { return parseResult_.Offset(); }
+
+protected:
+    void SetParseError(ParseErrorCode code, size_t offset) { parseResult_.Set(code, offset); }
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    GenericReader(const GenericReader&);
+    GenericReader& operator=(const GenericReader&);
+
+    void ClearStack() { stack_.Clear(); }
+
+    // clear stack on any exit from ParseStream, e.g. due to exception
+    struct ClearStackOnExit {
+        explicit ClearStackOnExit(GenericReader& r) : r_(r) {}
+        ~ClearStackOnExit() { r_.ClearStack(); }
+    private:
+        GenericReader& r_;
+        ClearStackOnExit(const ClearStackOnExit&);
+        ClearStackOnExit& operator=(const ClearStackOnExit&);
+    };
+
+    template<unsigned parseFlags, typename InputStream>
+    void SkipWhitespaceAndComments(InputStream& is) {
+        SkipWhitespace(is);
+
+        if (parseFlags & kParseCommentsFlag) {
+            while (RAPIDJSON_UNLIKELY(Consume(is, '/'))) {
+                if (Consume(is, '*')) {
+                    while (true) {
+                        if (RAPIDJSON_UNLIKELY(is.Peek() == '\0'))
+                            RAPIDJSON_PARSE_ERROR(kParseErrorUnspecificSyntaxError, is.Tell());
+                        else if (Consume(is, '*')) {
+                            if (Consume(is, '/'))
+                                break;
+                        }
+                        else
+                            is.Take();
+                    }
+                }
+                else if (RAPIDJSON_LIKELY(Consume(is, '/')))
+                    while (is.Peek() != '\0' && is.Take() != '\n') {}
+                else
+                    RAPIDJSON_PARSE_ERROR(kParseErrorUnspecificSyntaxError, is.Tell());
+
+                SkipWhitespace(is);
+            }
+        }
+    }
+
+    // Parse object: { string : value, ... }
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseObject(InputStream& is, Handler& handler) {
+        RAPIDJSON_ASSERT(is.Peek() == '{');
+        is.Take();  // Skip '{'
+
+        if (RAPIDJSON_UNLIKELY(!handler.StartObject()))
+            RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+        if (Consume(is, '}')) {
+            if (RAPIDJSON_UNLIKELY(!handler.EndObject(0)))  // empty object
+                RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+            return;
+        }
+
+        for (SizeType memberCount = 0;;) {
+            if (RAPIDJSON_UNLIKELY(is.Peek() != '"'))
+                RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissName, is.Tell());
+
+            ParseString<parseFlags>(is, handler, true);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            if (RAPIDJSON_UNLIKELY(!Consume(is, ':')))
+                RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissColon, is.Tell());
+
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            ParseValue<parseFlags>(is, handler);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            ++memberCount;
+
+            switch (is.Peek()) {
+                case ',':
+                    is.Take();
+                    SkipWhitespaceAndComments<parseFlags>(is);
+                    RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+                    break;
+                case '}':
+                    is.Take();
+                    if (RAPIDJSON_UNLIKELY(!handler.EndObject(memberCount)))
+                        RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+                    return;
+                default:
+                    RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissCommaOrCurlyBracket, is.Tell()); break; // This useless break is only for making warning and coverage happy
+            }
+
+            if (parseFlags & kParseTrailingCommasFlag) {
+                if (is.Peek() == '}') {
+                    if (RAPIDJSON_UNLIKELY(!handler.EndObject(memberCount)))
+                        RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+                    is.Take();
+                    return;
+                }
+            }
+        }
+    }
+
+    // Parse array: [ value, ... ]
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseArray(InputStream& is, Handler& handler) {
+        RAPIDJSON_ASSERT(is.Peek() == '[');
+        is.Take();  // Skip '['
+
+        if (RAPIDJSON_UNLIKELY(!handler.StartArray()))
+            RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+        if (Consume(is, ']')) {
+            if (RAPIDJSON_UNLIKELY(!handler.EndArray(0))) // empty array
+                RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+            return;
+        }
+
+        for (SizeType elementCount = 0;;) {
+            ParseValue<parseFlags>(is, handler);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            ++elementCount;
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+
+            if (Consume(is, ',')) {
+                SkipWhitespaceAndComments<parseFlags>(is);
+                RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+            }
+            else if (Consume(is, ']')) {
+                if (RAPIDJSON_UNLIKELY(!handler.EndArray(elementCount)))
+                    RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+                return;
+            }
+            else
+                RAPIDJSON_PARSE_ERROR(kParseErrorArrayMissCommaOrSquareBracket, is.Tell());
+
+            if (parseFlags & kParseTrailingCommasFlag) {
+                if (is.Peek() == ']') {
+                    if (RAPIDJSON_UNLIKELY(!handler.EndArray(elementCount)))
+                        RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+                    is.Take();
+                    return;
+                }
+            }
+        }
+    }
+
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseNull(InputStream& is, Handler& handler) {
+        RAPIDJSON_ASSERT(is.Peek() == 'n');
+        is.Take();
+
+        if (RAPIDJSON_LIKELY(Consume(is, 'u') && Consume(is, 'l') && Consume(is, 'l'))) {
+            if (RAPIDJSON_UNLIKELY(!handler.Null()))
+                RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+        }
+        else
+            RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell());
+    }
+
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseTrue(InputStream& is, Handler& handler) {
+        RAPIDJSON_ASSERT(is.Peek() == 't');
+        is.Take();
+
+        if (RAPIDJSON_LIKELY(Consume(is, 'r') && Consume(is, 'u') && Consume(is, 'e'))) {
+            if (RAPIDJSON_UNLIKELY(!handler.Bool(true)))
+                RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+        }
+        else
+            RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell());
+    }
+
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseFalse(InputStream& is, Handler& handler) {
+        RAPIDJSON_ASSERT(is.Peek() == 'f');
+        is.Take();
+
+        if (RAPIDJSON_LIKELY(Consume(is, 'a') && Consume(is, 'l') && Consume(is, 's') && Consume(is, 'e'))) {
+            if (RAPIDJSON_UNLIKELY(!handler.Bool(false)))
+                RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
+        }
+        else
+            RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell());
+    }
+
+    template<typename InputStream>
+    RAPIDJSON_FORCEINLINE static bool Consume(InputStream& is, typename InputStream::Ch expect) {
+        if (RAPIDJSON_LIKELY(is.Peek() == expect)) {
+            is.Take();
+            return true;
+        }
+        else
+            return false;
+    }
+
+    // Helper function to parse four hexadecimal digits in \uXXXX in ParseString().
+    template<typename InputStream>
+    unsigned ParseHex4(InputStream& is, size_t escapeOffset) {
+        unsigned codepoint = 0;
+        for (int i = 0; i < 4; i++) {
+            Ch c = is.Peek();
+            codepoint <<= 4;
+            codepoint += static_cast<unsigned>(c);
+            if (c >= '0' && c <= '9')
+                codepoint -= '0';
+            else if (c >= 'A' && c <= 'F')
+                codepoint -= 'A' - 10;
+            else if (c >= 'a' && c <= 'f')
+                codepoint -= 'a' - 10;
+            else {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorStringUnicodeEscapeInvalidHex, escapeOffset);
+                RAPIDJSON_PARSE_ERROR_EARLY_RETURN(0);
+            }
+            is.Take();
+        }
+        return codepoint;
+    }
+
+    template <typename CharType>
+    class StackStream {
+    public:
+        typedef CharType Ch;
+
+        StackStream(internal::Stack<StackAllocator>& stack) : stack_(stack), length_(0) {}
+        RAPIDJSON_FORCEINLINE void Put(Ch c) {
+            *stack_.template Push<Ch>() = c;
+            ++length_;
+        }
+
+        RAPIDJSON_FORCEINLINE void* Push(SizeType count) {
+            length_ += count;
+            return stack_.template Push<Ch>(count);
+        }
+
+        size_t Length() const { return length_; }
+
+        Ch* Pop() {
+            return stack_.template Pop<Ch>(length_);
+        }
+
+    private:
+        StackStream(const StackStream&);
+        StackStream& operator=(const StackStream&);
+
+        internal::Stack<StackAllocator>& stack_;
+        SizeType length_;
+    };
+
+    // Parse string and generate String event. Different code paths for kParseInsituFlag.
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseString(InputStream& is, Handler& handler, bool isKey = false) {
+        internal::StreamLocalCopy<InputStream> copy(is);
+        InputStream& s(copy.s);
+
+        RAPIDJSON_ASSERT(s.Peek() == '\"');
+        s.Take();  // Skip '\"'
+
+        bool success = false;
+        if (parseFlags & kParseInsituFlag) {
+            typename InputStream::Ch *head = s.PutBegin();
+            ParseStringToStream<parseFlags, SourceEncoding, SourceEncoding>(s, s);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+            size_t length = s.PutEnd(head) - 1;
+            RAPIDJSON_ASSERT(length <= 0xFFFFFFFF);
+            const typename TargetEncoding::Ch* const str = reinterpret_cast<typename TargetEncoding::Ch*>(head);
+            success = (isKey ? handler.Key(str, SizeType(length), false) : handler.String(str, SizeType(length), false));
+        }
+        else {
+            StackStream<typename TargetEncoding::Ch> stackStream(stack_);
+            ParseStringToStream<parseFlags, SourceEncoding, TargetEncoding>(s, stackStream);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+            SizeType length = static_cast<SizeType>(stackStream.Length()) - 1;
+            const typename TargetEncoding::Ch* const str = stackStream.Pop();
+            success = (isKey ? handler.Key(str, length, true) : handler.String(str, length, true));
+        }
+        if (RAPIDJSON_UNLIKELY(!success))
+            RAPIDJSON_PARSE_ERROR(kParseErrorTermination, s.Tell());
+    }
+
+    // Parse string to an output is
+    // This function handles the prefix/suffix double quotes, escaping, and optional encoding validation.
+    template<unsigned parseFlags, typename SEncoding, typename TEncoding, typename InputStream, typename OutputStream>
+    RAPIDJSON_FORCEINLINE void ParseStringToStream(InputStream& is, OutputStream& os) {
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+        static const char escape[256] = {
+            Z16, Z16, 0, 0,'\"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '/',
+            Z16, Z16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0,
+            0, 0,'\b', 0, 0, 0,'\f', 0, 0, 0, 0, 0, 0, 0,'\n', 0,
+            0, 0,'\r', 0,'\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16
+        };
+#undef Z16
+//!@endcond
+
+        for (;;) {
+            // Scan and copy string before "\\\"" or < 0x20. This is an optional optimzation.
+            if (!(parseFlags & kParseValidateEncodingFlag))
+                ScanCopyUnescapedString(is, os);
+
+            Ch c = is.Peek();
+            if (RAPIDJSON_UNLIKELY(c == '\\')) {    // Escape
+                size_t escapeOffset = is.Tell();    // For invalid escaping, report the initial '\\' as error offset
+                is.Take();
+                Ch e = is.Peek();
+                if ((sizeof(Ch) == 1 || unsigned(e) < 256) && RAPIDJSON_LIKELY(escape[static_cast<unsigned char>(e)])) {
+                    is.Take();
+                    os.Put(static_cast<typename TEncoding::Ch>(escape[static_cast<unsigned char>(e)]));
+                }
+                else if ((parseFlags & kParseEscapedApostropheFlag) && RAPIDJSON_LIKELY(e == '\'')) { // Allow escaped apostrophe
+                    is.Take();
+                    os.Put('\'');
+                }
+                else if (RAPIDJSON_LIKELY(e == 'u')) {    // Unicode
+                    is.Take();
+                    unsigned codepoint = ParseHex4(is, escapeOffset);
+                    RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+                    if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+                        // high surrogate, check if followed by valid low surrogate
+                        if (RAPIDJSON_LIKELY(codepoint <= 0xDBFF)) {
+                            // Handle UTF-16 surrogate pair
+                            if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u')))
+                                RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset);
+                            unsigned codepoint2 = ParseHex4(is, escapeOffset);
+                            RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID;
+                            if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF))
+                                RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset);
+                            codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
+                        }
+                        // single low surrogate
+                        else
+                        {
+                            RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset);
+                        }
+                    }
+                    TEncoding::Encode(os, codepoint);
+                }
+                else
+                    RAPIDJSON_PARSE_ERROR(kParseErrorStringEscapeInvalid, escapeOffset);
+            }
+            else if (RAPIDJSON_UNLIKELY(c == '"')) {    // Closing double quote
+                is.Take();
+                os.Put('\0');   // null-terminate the string
+                return;
+            }
+            else if (RAPIDJSON_UNLIKELY(static_cast<unsigned>(c) < 0x20)) { // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+                if (c == '\0')
+                    RAPIDJSON_PARSE_ERROR(kParseErrorStringMissQuotationMark, is.Tell());
+                else
+                    RAPIDJSON_PARSE_ERROR(kParseErrorStringInvalidEncoding, is.Tell());
+            }
+            else {
+                size_t offset = is.Tell();
+                if (RAPIDJSON_UNLIKELY((parseFlags & kParseValidateEncodingFlag ?
+                    !Transcoder<SEncoding, TEncoding>::Validate(is, os) :
+                    !Transcoder<SEncoding, TEncoding>::Transcode(is, os))))
+                    RAPIDJSON_PARSE_ERROR(kParseErrorStringInvalidEncoding, offset);
+            }
+        }
+    }
+
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InputStream&, OutputStream&) {
+            // Do nothing for generic version
+    }
+
+#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42)
+    // StringStream -> StackStream<char>
+    static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream<char>& os) {
+        const char* p = is.src_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        while (p != nextAligned)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = p;
+                return;
+            }
+            else
+                os.Put(*p++);
+
+        // The rest of string using SIMD
+        static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' };
+        static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
+        static const char space[16]  = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F };
+        const __m128i dq = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0]));
+        const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0]));
+        const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0]));
+
+        for (;; p += 16) {
+            const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+            const __m128i t1 = _mm_cmpeq_epi8(s, dq);
+            const __m128i t2 = _mm_cmpeq_epi8(s, bs);
+            const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F
+            const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3);
+            unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x));
+            if (RAPIDJSON_UNLIKELY(r != 0)) {   // some of characters is escaped
+                SizeType length;
+    #ifdef _MSC_VER         // Find the index of first escaped
+                unsigned long offset;
+                _BitScanForward(&offset, r);
+                length = offset;
+    #else
+                length = static_cast<SizeType>(__builtin_ffs(r) - 1);
+    #endif
+                if (length != 0) {
+                    char* q = reinterpret_cast<char*>(os.Push(length));
+                    for (size_t i = 0; i < length; i++)
+                        q[i] = p[i];
+
+                    p += length;
+                }
+                break;
+            }
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(os.Push(16)), s);
+        }
+
+        is.src_ = p;
+    }
+
+    // InsituStringStream -> InsituStringStream
+    static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) {
+        RAPIDJSON_ASSERT(&is == &os);
+        (void)os;
+
+        if (is.src_ == is.dst_) {
+            SkipUnescapedString(is);
+            return;
+        }
+
+        char* p = is.src_;
+        char *q = is.dst_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        while (p != nextAligned)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = p;
+                is.dst_ = q;
+                return;
+            }
+            else
+                *q++ = *p++;
+
+        // The rest of string using SIMD
+        static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' };
+        static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
+        static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F };
+        const __m128i dq = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0]));
+        const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0]));
+        const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0]));
+
+        for (;; p += 16, q += 16) {
+            const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+            const __m128i t1 = _mm_cmpeq_epi8(s, dq);
+            const __m128i t2 = _mm_cmpeq_epi8(s, bs);
+            const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F
+            const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3);
+            unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x));
+            if (RAPIDJSON_UNLIKELY(r != 0)) {   // some of characters is escaped
+                size_t length;
+#ifdef _MSC_VER         // Find the index of first escaped
+                unsigned long offset;
+                _BitScanForward(&offset, r);
+                length = offset;
+#else
+                length = static_cast<size_t>(__builtin_ffs(r) - 1);
+#endif
+                for (const char* pend = p + length; p != pend; )
+                    *q++ = *p++;
+                break;
+            }
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(q), s);
+        }
+
+        is.src_ = p;
+        is.dst_ = q;
+    }
+
+    // When read/write pointers are the same for insitu stream, just skip unescaped characters
+    static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) {
+        RAPIDJSON_ASSERT(is.src_ == is.dst_);
+        char* p = is.src_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        for (; p != nextAligned; p++)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = is.dst_ = p;
+                return;
+            }
+
+        // The rest of string using SIMD
+        static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' };
+        static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
+        static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F };
+        const __m128i dq = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0]));
+        const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0]));
+        const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0]));
+
+        for (;; p += 16) {
+            const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+            const __m128i t1 = _mm_cmpeq_epi8(s, dq);
+            const __m128i t2 = _mm_cmpeq_epi8(s, bs);
+            const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F
+            const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3);
+            unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x));
+            if (RAPIDJSON_UNLIKELY(r != 0)) {   // some of characters is escaped
+                size_t length;
+#ifdef _MSC_VER         // Find the index of first escaped
+                unsigned long offset;
+                _BitScanForward(&offset, r);
+                length = offset;
+#else
+                length = static_cast<size_t>(__builtin_ffs(r) - 1);
+#endif
+                p += length;
+                break;
+            }
+        }
+
+        is.src_ = is.dst_ = p;
+    }
+#elif defined(RAPIDJSON_NEON)
+    // StringStream -> StackStream<char>
+    static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream<char>& os) {
+        const char* p = is.src_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        while (p != nextAligned)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = p;
+                return;
+            }
+            else
+                os.Put(*p++);
+
+        // The rest of string using SIMD
+        const uint8x16_t s0 = vmovq_n_u8('"');
+        const uint8x16_t s1 = vmovq_n_u8('\\');
+        const uint8x16_t s2 = vmovq_n_u8('\b');
+        const uint8x16_t s3 = vmovq_n_u8(32);
+
+        for (;; p += 16) {
+            const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
+            uint8x16_t x = vceqq_u8(s, s0);
+            x = vorrq_u8(x, vceqq_u8(s, s1));
+            x = vorrq_u8(x, vceqq_u8(s, s2));
+            x = vorrq_u8(x, vcltq_u8(s, s3));
+
+            x = vrev64q_u8(x);                     // Rev in 64
+            uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+            uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+            SizeType length = 0;
+            bool escaped = false;
+            if (low == 0) {
+                if (high != 0) {
+                    uint32_t lz = internal::clzll(high);
+                    length = 8 + (lz >> 3);
+                    escaped = true;
+                }
+            } else {
+                uint32_t lz = internal::clzll(low);
+                length = lz >> 3;
+                escaped = true;
+            }
+            if (RAPIDJSON_UNLIKELY(escaped)) {   // some of characters is escaped
+                if (length != 0) {
+                    char* q = reinterpret_cast<char*>(os.Push(length));
+                    for (size_t i = 0; i < length; i++)
+                        q[i] = p[i];
+
+                    p += length;
+                }
+                break;
+            }
+            vst1q_u8(reinterpret_cast<uint8_t *>(os.Push(16)), s);
+        }
+
+        is.src_ = p;
+    }
+
+    // InsituStringStream -> InsituStringStream
+    static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) {
+        RAPIDJSON_ASSERT(&is == &os);
+        (void)os;
+
+        if (is.src_ == is.dst_) {
+            SkipUnescapedString(is);
+            return;
+        }
+
+        char* p = is.src_;
+        char *q = is.dst_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        while (p != nextAligned)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = p;
+                is.dst_ = q;
+                return;
+            }
+            else
+                *q++ = *p++;
+
+        // The rest of string using SIMD
+        const uint8x16_t s0 = vmovq_n_u8('"');
+        const uint8x16_t s1 = vmovq_n_u8('\\');
+        const uint8x16_t s2 = vmovq_n_u8('\b');
+        const uint8x16_t s3 = vmovq_n_u8(32);
+
+        for (;; p += 16, q += 16) {
+            const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p));
+            uint8x16_t x = vceqq_u8(s, s0);
+            x = vorrq_u8(x, vceqq_u8(s, s1));
+            x = vorrq_u8(x, vceqq_u8(s, s2));
+            x = vorrq_u8(x, vcltq_u8(s, s3));
+
+            x = vrev64q_u8(x);                     // Rev in 64
+            uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+            uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+            SizeType length = 0;
+            bool escaped = false;
+            if (low == 0) {
+                if (high != 0) {
+                    uint32_t lz = internal::clzll(high);
+                    length = 8 + (lz >> 3);
+                    escaped = true;
+                }
+            } else {
+                uint32_t lz = internal::clzll(low);
+                length = lz >> 3;
+                escaped = true;
+            }
+            if (RAPIDJSON_UNLIKELY(escaped)) {   // some of characters is escaped
+                for (const char* pend = p + length; p != pend; ) {
+                    *q++ = *p++;
+                }
+                break;
+            }
+            vst1q_u8(reinterpret_cast<uint8_t *>(q), s);
+        }
+
+        is.src_ = p;
+        is.dst_ = q;
+    }
+
+    // When read/write pointers are the same for insitu stream, just skip unescaped characters
+    static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) {
+        RAPIDJSON_ASSERT(is.src_ == is.dst_);
+        char* p = is.src_;
+
+        // Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
+        const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+        for (; p != nextAligned; p++)
+            if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
+                is.src_ = is.dst_ = p;
+                return;
+            }
+
+        // The rest of string using SIMD
+        const uint8x16_t s0 = vmovq_n_u8('"');
+        const uint8x16_t s1 = vmovq_n_u8('\\');
+        const uint8x16_t s2 = vmovq_n_u8('\b');
+        const uint8x16_t s3 = vmovq_n_u8(32);
+
+        for (;; p += 16) {
+            const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p));
+            uint8x16_t x = vceqq_u8(s, s0);
+            x = vorrq_u8(x, vceqq_u8(s, s1));
+            x = vorrq_u8(x, vceqq_u8(s, s2));
+            x = vorrq_u8(x, vcltq_u8(s, s3));
+
+            x = vrev64q_u8(x);                     // Rev in 64
+            uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+            uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+            if (low == 0) {
+                if (high != 0) {
+                    uint32_t lz = internal::clzll(high);
+                    p += 8 + (lz >> 3);
+                    break;
+                }
+            } else {
+                uint32_t lz = internal::clzll(low);
+                p += lz >> 3;
+                break;
+            }
+        }
+
+        is.src_ = is.dst_ = p;
+    }
+#endif // RAPIDJSON_NEON
+
+    template<typename InputStream, typename StackCharacter, bool backup, bool pushOnTake>
+    class NumberStream;
+
+    template<typename InputStream, typename StackCharacter>
+    class NumberStream<InputStream, StackCharacter, false, false> {
+    public:
+        typedef typename InputStream::Ch Ch;
+
+        NumberStream(GenericReader& reader, InputStream& s) : is(s) { (void)reader;  }
+
+        RAPIDJSON_FORCEINLINE Ch Peek() const { return is.Peek(); }
+        RAPIDJSON_FORCEINLINE Ch TakePush() { return is.Take(); }
+        RAPIDJSON_FORCEINLINE Ch Take() { return is.Take(); }
+        RAPIDJSON_FORCEINLINE void Push(char) {}
+
+        size_t Tell() { return is.Tell(); }
+        size_t Length() { return 0; }
+        const StackCharacter* Pop() { return 0; }
+
+    protected:
+        NumberStream& operator=(const NumberStream&);
+
+        InputStream& is;
+    };
+
+    template<typename InputStream, typename StackCharacter>
+    class NumberStream<InputStream, StackCharacter, true, false> : public NumberStream<InputStream, StackCharacter, false, false> {
+        typedef NumberStream<InputStream, StackCharacter, false, false> Base;
+    public:
+        NumberStream(GenericReader& reader, InputStream& s) : Base(reader, s), stackStream(reader.stack_) {}
+
+        RAPIDJSON_FORCEINLINE Ch TakePush() {
+            stackStream.Put(static_cast<StackCharacter>(Base::is.Peek()));
+            return Base::is.Take();
+        }
+
+        RAPIDJSON_FORCEINLINE void Push(StackCharacter c) {
+            stackStream.Put(c);
+        }
+
+        size_t Length() { return stackStream.Length(); }
+
+        const StackCharacter* Pop() {
+            stackStream.Put('\0');
+            return stackStream.Pop();
+        }
+
+    private:
+        StackStream<StackCharacter> stackStream;
+    };
+
+    template<typename InputStream, typename StackCharacter>
+    class NumberStream<InputStream, StackCharacter, true, true> : public NumberStream<InputStream, StackCharacter, true, false> {
+        typedef NumberStream<InputStream, StackCharacter, true, false> Base;
+    public:
+        NumberStream(GenericReader& reader, InputStream& s) : Base(reader, s) {}
+
+        RAPIDJSON_FORCEINLINE Ch Take() { return Base::TakePush(); }
+    };
+
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseNumber(InputStream& is, Handler& handler) {
+        typedef typename internal::SelectIf<internal::BoolType<(parseFlags & kParseNumbersAsStringsFlag) != 0>, typename TargetEncoding::Ch, char>::Type NumberCharacter;
+
+        internal::StreamLocalCopy<InputStream> copy(is);
+        NumberStream<InputStream, NumberCharacter,
+            ((parseFlags & kParseNumbersAsStringsFlag) != 0) ?
+                ((parseFlags & kParseInsituFlag) == 0) :
+                ((parseFlags & kParseFullPrecisionFlag) != 0),
+            (parseFlags & kParseNumbersAsStringsFlag) != 0 &&
+                (parseFlags & kParseInsituFlag) == 0> s(*this, copy.s);
+
+        size_t startOffset = s.Tell();
+        double d = 0.0;
+        bool useNanOrInf = false;
+
+        // Parse minus
+        bool minus = Consume(s, '-');
+
+        // Parse int: zero / ( digit1-9 *DIGIT )
+        unsigned i = 0;
+        uint64_t i64 = 0;
+        bool use64bit = false;
+        int significandDigit = 0;
+        if (RAPIDJSON_UNLIKELY(s.Peek() == '0')) {
+            i = 0;
+            s.TakePush();
+        }
+        else if (RAPIDJSON_LIKELY(s.Peek() >= '1' && s.Peek() <= '9')) {
+            i = static_cast<unsigned>(s.TakePush() - '0');
+
+            if (minus)
+                while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                    if (RAPIDJSON_UNLIKELY(i >= 214748364)) { // 2^31 = 2147483648
+                        if (RAPIDJSON_LIKELY(i != 214748364 || s.Peek() > '8')) {
+                            i64 = i;
+                            use64bit = true;
+                            break;
+                        }
+                    }
+                    i = i * 10 + static_cast<unsigned>(s.TakePush() - '0');
+                    significandDigit++;
+                }
+            else
+                while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                    if (RAPIDJSON_UNLIKELY(i >= 429496729)) { // 2^32 - 1 = 4294967295
+                        if (RAPIDJSON_LIKELY(i != 429496729 || s.Peek() > '5')) {
+                            i64 = i;
+                            use64bit = true;
+                            break;
+                        }
+                    }
+                    i = i * 10 + static_cast<unsigned>(s.TakePush() - '0');
+                    significandDigit++;
+                }
+        }
+        // Parse NaN or Infinity here
+        else if ((parseFlags & kParseNanAndInfFlag) && RAPIDJSON_LIKELY((s.Peek() == 'I' || s.Peek() == 'N'))) {
+            if (Consume(s, 'N')) {
+                if (Consume(s, 'a') && Consume(s, 'N')) {
+                    d = std::numeric_limits<double>::quiet_NaN();
+                    useNanOrInf = true;
+                }
+            }
+            else if (RAPIDJSON_LIKELY(Consume(s, 'I'))) {
+                if (Consume(s, 'n') && Consume(s, 'f')) {
+                    d = (minus ? -std::numeric_limits<double>::infinity() : std::numeric_limits<double>::infinity());
+                    useNanOrInf = true;
+
+                    if (RAPIDJSON_UNLIKELY(s.Peek() == 'i' && !(Consume(s, 'i') && Consume(s, 'n')
+                                                                && Consume(s, 'i') && Consume(s, 't') && Consume(s, 'y')))) {
+                        RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell());
+                    }
+                }
+            }
+
+            if (RAPIDJSON_UNLIKELY(!useNanOrInf)) {
+                RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell());
+            }
+        }
+        else
+            RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell());
+
+        // Parse 64bit int
+        bool useDouble = false;
+        if (use64bit) {
+            if (minus)
+                while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                     if (RAPIDJSON_UNLIKELY(i64 >= RAPIDJSON_UINT64_C2(0x0CCCCCCC, 0xCCCCCCCC))) // 2^63 = 9223372036854775808
+                        if (RAPIDJSON_LIKELY(i64 != RAPIDJSON_UINT64_C2(0x0CCCCCCC, 0xCCCCCCCC) || s.Peek() > '8')) {
+                            d = static_cast<double>(i64);
+                            useDouble = true;
+                            break;
+                        }
+                    i64 = i64 * 10 + static_cast<unsigned>(s.TakePush() - '0');
+                    significandDigit++;
+                }
+            else
+                while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                    if (RAPIDJSON_UNLIKELY(i64 >= RAPIDJSON_UINT64_C2(0x19999999, 0x99999999))) // 2^64 - 1 = 18446744073709551615
+                        if (RAPIDJSON_LIKELY(i64 != RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) || s.Peek() > '5')) {
+                            d = static_cast<double>(i64);
+                            useDouble = true;
+                            break;
+                        }
+                    i64 = i64 * 10 + static_cast<unsigned>(s.TakePush() - '0');
+                    significandDigit++;
+                }
+        }
+
+        // Force double for big integer
+        if (useDouble) {
+            while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                d = d * 10 + (s.TakePush() - '0');
+            }
+        }
+
+        // Parse frac = decimal-point 1*DIGIT
+        int expFrac = 0;
+        size_t decimalPosition;
+        if (!useNanOrInf && Consume(s, '.')) {
+            decimalPosition = s.Length();
+
+            if (RAPIDJSON_UNLIKELY(!(s.Peek() >= '0' && s.Peek() <= '9')))
+                RAPIDJSON_PARSE_ERROR(kParseErrorNumberMissFraction, s.Tell());
+
+            if (!useDouble) {
+#if RAPIDJSON_64BIT
+                // Use i64 to store significand in 64-bit architecture
+                if (!use64bit)
+                    i64 = i;
+
+                while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                    if (i64 > RAPIDJSON_UINT64_C2(0x1FFFFF, 0xFFFFFFFF)) // 2^53 - 1 for fast path
+                        break;
+                    else {
+                        i64 = i64 * 10 + static_cast<unsigned>(s.TakePush() - '0');
+                        --expFrac;
+                        if (i64 != 0)
+                            significandDigit++;
+                    }
+                }
+
+                d = static_cast<double>(i64);
+#else
+                // Use double to store significand in 32-bit architecture
+                d = static_cast<double>(use64bit ? i64 : i);
+#endif
+                useDouble = true;
+            }
+
+            while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                if (significandDigit < 17) {
+                    d = d * 10.0 + (s.TakePush() - '0');
+                    --expFrac;
+                    if (RAPIDJSON_LIKELY(d > 0.0))
+                        significandDigit++;
+                }
+                else
+                    s.TakePush();
+            }
+        }
+        else
+            decimalPosition = s.Length(); // decimal position at the end of integer.
+
+        // Parse exp = e [ minus / plus ] 1*DIGIT
+        int exp = 0;
+        if (!useNanOrInf && (Consume(s, 'e') || Consume(s, 'E'))) {
+            if (!useDouble) {
+                d = static_cast<double>(use64bit ? i64 : i);
+                useDouble = true;
+            }
+
+            bool expMinus = false;
+            if (Consume(s, '+'))
+                ;
+            else if (Consume(s, '-'))
+                expMinus = true;
+
+            if (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                exp = static_cast<int>(s.Take() - '0');
+                if (expMinus) {
+                    // (exp + expFrac) must not underflow int => we're detecting when -exp gets
+                    // dangerously close to INT_MIN (a pessimistic next digit 9 would push it into
+                    // underflow territory):
+                    //
+                    //        -(exp * 10 + 9) + expFrac >= INT_MIN
+                    //   <=>  exp <= (expFrac - INT_MIN - 9) / 10
+                    RAPIDJSON_ASSERT(expFrac <= 0);
+                    int maxExp = (expFrac + 2147483639) / 10;
+
+                    while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                        exp = exp * 10 + static_cast<int>(s.Take() - '0');
+                        if (RAPIDJSON_UNLIKELY(exp > maxExp)) {
+                            while (RAPIDJSON_UNLIKELY(s.Peek() >= '0' && s.Peek() <= '9'))  // Consume the rest of exponent
+                                s.Take();
+                        }
+                    }
+                }
+                else {  // positive exp
+                    int maxExp = 308 - expFrac;
+                    while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) {
+                        exp = exp * 10 + static_cast<int>(s.Take() - '0');
+                        if (RAPIDJSON_UNLIKELY(exp > maxExp))
+                            RAPIDJSON_PARSE_ERROR(kParseErrorNumberTooBig, startOffset);
+                    }
+                }
+            }
+            else
+                RAPIDJSON_PARSE_ERROR(kParseErrorNumberMissExponent, s.Tell());
+
+            if (expMinus)
+                exp = -exp;
+        }
+
+        // Finish parsing, call event according to the type of number.
+        bool cont = true;
+
+        if (parseFlags & kParseNumbersAsStringsFlag) {
+            if (parseFlags & kParseInsituFlag) {
+                s.Pop();  // Pop stack no matter if it will be used or not.
+                typename InputStream::Ch* head = is.PutBegin();
+                const size_t length = s.Tell() - startOffset;
+                RAPIDJSON_ASSERT(length <= 0xFFFFFFFF);
+                // unable to insert the \0 character here, it will erase the comma after this number
+                const typename TargetEncoding::Ch* const str = reinterpret_cast<typename TargetEncoding::Ch*>(head);
+                cont = handler.RawNumber(str, SizeType(length), false);
+            }
+            else {
+                SizeType numCharsToCopy = static_cast<SizeType>(s.Length());
+                GenericStringStream<UTF8<NumberCharacter> > srcStream(s.Pop());
+                StackStream<typename TargetEncoding::Ch> dstStream(stack_);
+                while (numCharsToCopy--) {
+                    Transcoder<UTF8<typename TargetEncoding::Ch>, TargetEncoding>::Transcode(srcStream, dstStream);
+                }
+                dstStream.Put('\0');
+                const typename TargetEncoding::Ch* str = dstStream.Pop();
+                const SizeType length = static_cast<SizeType>(dstStream.Length()) - 1;
+                cont = handler.RawNumber(str, SizeType(length), true);
+            }
+        }
+        else {
+           size_t length = s.Length();
+           const NumberCharacter* decimal = s.Pop();  // Pop stack no matter if it will be used or not.
+
+           if (useDouble) {
+               int p = exp + expFrac;
+               if (parseFlags & kParseFullPrecisionFlag)
+                   d = internal::StrtodFullPrecision(d, p, decimal, length, decimalPosition, exp);
+               else
+                   d = internal::StrtodNormalPrecision(d, p);
+
+               // Use > max, instead of == inf, to fix bogus warning -Wfloat-equal
+               if (d > (std::numeric_limits<double>::max)()) {
+                   // Overflow
+                   // TODO: internal::StrtodX should report overflow (or underflow)
+                   RAPIDJSON_PARSE_ERROR(kParseErrorNumberTooBig, startOffset);
+               }
+
+               cont = handler.Double(minus ? -d : d);
+           }
+           else if (useNanOrInf) {
+               cont = handler.Double(d);
+           }
+           else {
+               if (use64bit) {
+                   if (minus)
+                       cont = handler.Int64(static_cast<int64_t>(~i64 + 1));
+                   else
+                       cont = handler.Uint64(i64);
+               }
+               else {
+                   if (minus)
+                       cont = handler.Int(static_cast<int32_t>(~i + 1));
+                   else
+                       cont = handler.Uint(i);
+               }
+           }
+        }
+        if (RAPIDJSON_UNLIKELY(!cont))
+            RAPIDJSON_PARSE_ERROR(kParseErrorTermination, startOffset);
+    }
+
+    // Parse any JSON value
+    template<unsigned parseFlags, typename InputStream, typename Handler>
+    void ParseValue(InputStream& is, Handler& handler) {
+        switch (is.Peek()) {
+            case 'n': ParseNull  <parseFlags>(is, handler); break;
+            case 't': ParseTrue  <parseFlags>(is, handler); break;
+            case 'f': ParseFalse <parseFlags>(is, handler); break;
+            case '"': ParseString<parseFlags>(is, handler); break;
+            case '{': ParseObject<parseFlags>(is, handler); break;
+            case '[': ParseArray <parseFlags>(is, handler); break;
+            default :
+                      ParseNumber<parseFlags>(is, handler);
+                      break;
+
+        }
+    }
+
+    // Iterative Parsing
+
+    // States
+    enum IterativeParsingState {
+        IterativeParsingFinishState = 0, // sink states at top
+        IterativeParsingErrorState,      // sink states at top
+        IterativeParsingStartState,
+
+        // Object states
+        IterativeParsingObjectInitialState,
+        IterativeParsingMemberKeyState,
+        IterativeParsingMemberValueState,
+        IterativeParsingObjectFinishState,
+
+        // Array states
+        IterativeParsingArrayInitialState,
+        IterativeParsingElementState,
+        IterativeParsingArrayFinishState,
+
+        // Single value state
+        IterativeParsingValueState,
+
+        // Delimiter states (at bottom)
+        IterativeParsingElementDelimiterState,
+        IterativeParsingMemberDelimiterState,
+        IterativeParsingKeyValueDelimiterState,
+
+        cIterativeParsingStateCount
+    };
+
+    // Tokens
+    enum Token {
+        LeftBracketToken = 0,
+        RightBracketToken,
+
+        LeftCurlyBracketToken,
+        RightCurlyBracketToken,
+
+        CommaToken,
+        ColonToken,
+
+        StringToken,
+        FalseToken,
+        TrueToken,
+        NullToken,
+        NumberToken,
+
+        kTokenCount
+    };
+
+    RAPIDJSON_FORCEINLINE Token Tokenize(Ch c) const {
+
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#define N NumberToken
+#define N16 N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N
+        // Maps from ASCII to Token
+        static const unsigned char tokenMap[256] = {
+            N16, // 00~0F
+            N16, // 10~1F
+            N, N, StringToken, N, N, N, N, N, N, N, N, N, CommaToken, N, N, N, // 20~2F
+            N, N, N, N, N, N, N, N, N, N, ColonToken, N, N, N, N, N, // 30~3F
+            N16, // 40~4F
+            N, N, N, N, N, N, N, N, N, N, N, LeftBracketToken, N, RightBracketToken, N, N, // 50~5F
+            N, N, N, N, N, N, FalseToken, N, N, N, N, N, N, N, NullToken, N, // 60~6F
+            N, N, N, N, TrueToken, N, N, N, N, N, N, LeftCurlyBracketToken, N, RightCurlyBracketToken, N, N, // 70~7F
+            N16, N16, N16, N16, N16, N16, N16, N16 // 80~FF
+        };
+#undef N
+#undef N16
+//!@endcond
+
+        if (sizeof(Ch) == 1 || static_cast<unsigned>(c) < 256)
+            return static_cast<Token>(tokenMap[static_cast<unsigned char>(c)]);
+        else
+            return NumberToken;
+    }
+
+    RAPIDJSON_FORCEINLINE IterativeParsingState Predict(IterativeParsingState state, Token token) const {
+        // current state x one lookahead token -> new state
+        static const char G[cIterativeParsingStateCount][kTokenCount] = {
+            // Finish(sink state)
+            {
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState
+            },
+            // Error(sink state)
+            {
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState
+            },
+            // Start
+            {
+                IterativeParsingArrayInitialState,  // Left bracket
+                IterativeParsingErrorState,         // Right bracket
+                IterativeParsingObjectInitialState, // Left curly bracket
+                IterativeParsingErrorState,         // Right curly bracket
+                IterativeParsingErrorState,         // Comma
+                IterativeParsingErrorState,         // Colon
+                IterativeParsingValueState,         // String
+                IterativeParsingValueState,         // False
+                IterativeParsingValueState,         // True
+                IterativeParsingValueState,         // Null
+                IterativeParsingValueState          // Number
+            },
+            // ObjectInitial
+            {
+                IterativeParsingErrorState,         // Left bracket
+                IterativeParsingErrorState,         // Right bracket
+                IterativeParsingErrorState,         // Left curly bracket
+                IterativeParsingObjectFinishState,  // Right curly bracket
+                IterativeParsingErrorState,         // Comma
+                IterativeParsingErrorState,         // Colon
+                IterativeParsingMemberKeyState,     // String
+                IterativeParsingErrorState,         // False
+                IterativeParsingErrorState,         // True
+                IterativeParsingErrorState,         // Null
+                IterativeParsingErrorState          // Number
+            },
+            // MemberKey
+            {
+                IterativeParsingErrorState,             // Left bracket
+                IterativeParsingErrorState,             // Right bracket
+                IterativeParsingErrorState,             // Left curly bracket
+                IterativeParsingErrorState,             // Right curly bracket
+                IterativeParsingErrorState,             // Comma
+                IterativeParsingKeyValueDelimiterState, // Colon
+                IterativeParsingErrorState,             // String
+                IterativeParsingErrorState,             // False
+                IterativeParsingErrorState,             // True
+                IterativeParsingErrorState,             // Null
+                IterativeParsingErrorState              // Number
+            },
+            // MemberValue
+            {
+                IterativeParsingErrorState,             // Left bracket
+                IterativeParsingErrorState,             // Right bracket
+                IterativeParsingErrorState,             // Left curly bracket
+                IterativeParsingObjectFinishState,      // Right curly bracket
+                IterativeParsingMemberDelimiterState,   // Comma
+                IterativeParsingErrorState,             // Colon
+                IterativeParsingErrorState,             // String
+                IterativeParsingErrorState,             // False
+                IterativeParsingErrorState,             // True
+                IterativeParsingErrorState,             // Null
+                IterativeParsingErrorState              // Number
+            },
+            // ObjectFinish(sink state)
+            {
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState
+            },
+            // ArrayInitial
+            {
+                IterativeParsingArrayInitialState,      // Left bracket(push Element state)
+                IterativeParsingArrayFinishState,       // Right bracket
+                IterativeParsingObjectInitialState,     // Left curly bracket(push Element state)
+                IterativeParsingErrorState,             // Right curly bracket
+                IterativeParsingErrorState,             // Comma
+                IterativeParsingErrorState,             // Colon
+                IterativeParsingElementState,           // String
+                IterativeParsingElementState,           // False
+                IterativeParsingElementState,           // True
+                IterativeParsingElementState,           // Null
+                IterativeParsingElementState            // Number
+            },
+            // Element
+            {
+                IterativeParsingErrorState,             // Left bracket
+                IterativeParsingArrayFinishState,       // Right bracket
+                IterativeParsingErrorState,             // Left curly bracket
+                IterativeParsingErrorState,             // Right curly bracket
+                IterativeParsingElementDelimiterState,  // Comma
+                IterativeParsingErrorState,             // Colon
+                IterativeParsingErrorState,             // String
+                IterativeParsingErrorState,             // False
+                IterativeParsingErrorState,             // True
+                IterativeParsingErrorState,             // Null
+                IterativeParsingErrorState              // Number
+            },
+            // ArrayFinish(sink state)
+            {
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState
+            },
+            // Single Value (sink state)
+            {
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState,
+                IterativeParsingErrorState
+            },
+            // ElementDelimiter
+            {
+                IterativeParsingArrayInitialState,      // Left bracket(push Element state)
+                IterativeParsingArrayFinishState,       // Right bracket
+                IterativeParsingObjectInitialState,     // Left curly bracket(push Element state)
+                IterativeParsingErrorState,             // Right curly bracket
+                IterativeParsingErrorState,             // Comma
+                IterativeParsingErrorState,             // Colon
+                IterativeParsingElementState,           // String
+                IterativeParsingElementState,           // False
+                IterativeParsingElementState,           // True
+                IterativeParsingElementState,           // Null
+                IterativeParsingElementState            // Number
+            },
+            // MemberDelimiter
+            {
+                IterativeParsingErrorState,         // Left bracket
+                IterativeParsingErrorState,         // Right bracket
+                IterativeParsingErrorState,         // Left curly bracket
+                IterativeParsingObjectFinishState,  // Right curly bracket
+                IterativeParsingErrorState,         // Comma
+                IterativeParsingErrorState,         // Colon
+                IterativeParsingMemberKeyState,     // String
+                IterativeParsingErrorState,         // False
+                IterativeParsingErrorState,         // True
+                IterativeParsingErrorState,         // Null
+                IterativeParsingErrorState          // Number
+            },
+            // KeyValueDelimiter
+            {
+                IterativeParsingArrayInitialState,      // Left bracket(push MemberValue state)
+                IterativeParsingErrorState,             // Right bracket
+                IterativeParsingObjectInitialState,     // Left curly bracket(push MemberValue state)
+                IterativeParsingErrorState,             // Right curly bracket
+                IterativeParsingErrorState,             // Comma
+                IterativeParsingErrorState,             // Colon
+                IterativeParsingMemberValueState,       // String
+                IterativeParsingMemberValueState,       // False
+                IterativeParsingMemberValueState,       // True
+                IterativeParsingMemberValueState,       // Null
+                IterativeParsingMemberValueState        // Number
+            },
+        }; // End of G
+
+        return static_cast<IterativeParsingState>(G[state][token]);
+    }
+
+    // Make an advance in the token stream and state based on the candidate destination state which was returned by Transit().
+    // May return a new state on state pop.
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    RAPIDJSON_FORCEINLINE IterativeParsingState Transit(IterativeParsingState src, Token token, IterativeParsingState dst, InputStream& is, Handler& handler) {
+        (void)token;
+
+        switch (dst) {
+        case IterativeParsingErrorState:
+            return dst;
+
+        case IterativeParsingObjectInitialState:
+        case IterativeParsingArrayInitialState:
+        {
+            // Push the state(Element or MemeberValue) if we are nested in another array or value of member.
+            // In this way we can get the correct state on ObjectFinish or ArrayFinish by frame pop.
+            IterativeParsingState n = src;
+            if (src == IterativeParsingArrayInitialState || src == IterativeParsingElementDelimiterState)
+                n = IterativeParsingElementState;
+            else if (src == IterativeParsingKeyValueDelimiterState)
+                n = IterativeParsingMemberValueState;
+            // Push current state.
+            *stack_.template Push<SizeType>(1) = n;
+            // Initialize and push the member/element count.
+            *stack_.template Push<SizeType>(1) = 0;
+            // Call handler
+            bool hr = (dst == IterativeParsingObjectInitialState) ? handler.StartObject() : handler.StartArray();
+            // On handler short circuits the parsing.
+            if (!hr) {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell());
+                return IterativeParsingErrorState;
+            }
+            else {
+                is.Take();
+                return dst;
+            }
+        }
+
+        case IterativeParsingMemberKeyState:
+            ParseString<parseFlags>(is, handler, true);
+            if (HasParseError())
+                return IterativeParsingErrorState;
+            else
+                return dst;
+
+        case IterativeParsingKeyValueDelimiterState:
+            RAPIDJSON_ASSERT(token == ColonToken);
+            is.Take();
+            return dst;
+
+        case IterativeParsingMemberValueState:
+            // Must be non-compound value. Or it would be ObjectInitial or ArrayInitial state.
+            ParseValue<parseFlags>(is, handler);
+            if (HasParseError()) {
+                return IterativeParsingErrorState;
+            }
+            return dst;
+
+        case IterativeParsingElementState:
+            // Must be non-compound value. Or it would be ObjectInitial or ArrayInitial state.
+            ParseValue<parseFlags>(is, handler);
+            if (HasParseError()) {
+                return IterativeParsingErrorState;
+            }
+            return dst;
+
+        case IterativeParsingMemberDelimiterState:
+        case IterativeParsingElementDelimiterState:
+            is.Take();
+            // Update member/element count.
+            *stack_.template Top<SizeType>() = *stack_.template Top<SizeType>() + 1;
+            return dst;
+
+        case IterativeParsingObjectFinishState:
+        {
+            // Transit from delimiter is only allowed when trailing commas are enabled
+            if (!(parseFlags & kParseTrailingCommasFlag) && src == IterativeParsingMemberDelimiterState) {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorObjectMissName, is.Tell());
+                return IterativeParsingErrorState;
+            }
+            // Get member count.
+            SizeType c = *stack_.template Pop<SizeType>(1);
+            // If the object is not empty, count the last member.
+            if (src == IterativeParsingMemberValueState)
+                ++c;
+            // Restore the state.
+            IterativeParsingState n = static_cast<IterativeParsingState>(*stack_.template Pop<SizeType>(1));
+            // Transit to Finish state if this is the topmost scope.
+            if (n == IterativeParsingStartState)
+                n = IterativeParsingFinishState;
+            // Call handler
+            bool hr = handler.EndObject(c);
+            // On handler short circuits the parsing.
+            if (!hr) {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell());
+                return IterativeParsingErrorState;
+            }
+            else {
+                is.Take();
+                return n;
+            }
+        }
+
+        case IterativeParsingArrayFinishState:
+        {
+            // Transit from delimiter is only allowed when trailing commas are enabled
+            if (!(parseFlags & kParseTrailingCommasFlag) && src == IterativeParsingElementDelimiterState) {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorValueInvalid, is.Tell());
+                return IterativeParsingErrorState;
+            }
+            // Get element count.
+            SizeType c = *stack_.template Pop<SizeType>(1);
+            // If the array is not empty, count the last element.
+            if (src == IterativeParsingElementState)
+                ++c;
+            // Restore the state.
+            IterativeParsingState n = static_cast<IterativeParsingState>(*stack_.template Pop<SizeType>(1));
+            // Transit to Finish state if this is the topmost scope.
+            if (n == IterativeParsingStartState)
+                n = IterativeParsingFinishState;
+            // Call handler
+            bool hr = handler.EndArray(c);
+            // On handler short circuits the parsing.
+            if (!hr) {
+                RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell());
+                return IterativeParsingErrorState;
+            }
+            else {
+                is.Take();
+                return n;
+            }
+        }
+
+        default:
+            // This branch is for IterativeParsingValueState actually.
+            // Use `default:` rather than
+            // `case IterativeParsingValueState:` is for code coverage.
+
+            // The IterativeParsingStartState is not enumerated in this switch-case.
+            // It is impossible for that case. And it can be caught by following assertion.
+
+            // The IterativeParsingFinishState is not enumerated in this switch-case either.
+            // It is a "derivative" state which cannot triggered from Predict() directly.
+            // Therefore it cannot happen here. And it can be caught by following assertion.
+            RAPIDJSON_ASSERT(dst == IterativeParsingValueState);
+
+            // Must be non-compound value. Or it would be ObjectInitial or ArrayInitial state.
+            ParseValue<parseFlags>(is, handler);
+            if (HasParseError()) {
+                return IterativeParsingErrorState;
+            }
+            return IterativeParsingFinishState;
+        }
+    }
+
+    template <typename InputStream>
+    void HandleError(IterativeParsingState src, InputStream& is) {
+        if (HasParseError()) {
+            // Error flag has been set.
+            return;
+        }
+
+        switch (src) {
+        case IterativeParsingStartState:            RAPIDJSON_PARSE_ERROR(kParseErrorDocumentEmpty, is.Tell()); return;
+        case IterativeParsingFinishState:           RAPIDJSON_PARSE_ERROR(kParseErrorDocumentRootNotSingular, is.Tell()); return;
+        case IterativeParsingObjectInitialState:
+        case IterativeParsingMemberDelimiterState:  RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissName, is.Tell()); return;
+        case IterativeParsingMemberKeyState:        RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissColon, is.Tell()); return;
+        case IterativeParsingMemberValueState:      RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissCommaOrCurlyBracket, is.Tell()); return;
+        case IterativeParsingKeyValueDelimiterState:
+        case IterativeParsingArrayInitialState:
+        case IterativeParsingElementDelimiterState: RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell()); return;
+        default: RAPIDJSON_ASSERT(src == IterativeParsingElementState); RAPIDJSON_PARSE_ERROR(kParseErrorArrayMissCommaOrSquareBracket, is.Tell()); return;
+        }
+    }
+
+    RAPIDJSON_FORCEINLINE bool IsIterativeParsingDelimiterState(IterativeParsingState s) const {
+        return s >= IterativeParsingElementDelimiterState;
+    }
+
+    RAPIDJSON_FORCEINLINE bool IsIterativeParsingCompleteState(IterativeParsingState s) const {
+        return s <= IterativeParsingErrorState;
+    }
+
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    ParseResult IterativeParse(InputStream& is, Handler& handler) {
+        parseResult_.Clear();
+        ClearStackOnExit scope(*this);
+        IterativeParsingState state = IterativeParsingStartState;
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        while (is.Peek() != '\0') {
+            Token t = Tokenize(is.Peek());
+            IterativeParsingState n = Predict(state, t);
+            IterativeParsingState d = Transit<parseFlags>(state, t, n, is, handler);
+
+            if (d == IterativeParsingErrorState) {
+                HandleError(state, is);
+                break;
+            }
+
+            state = d;
+
+            // Do not further consume streams if a root JSON has been parsed.
+            if ((parseFlags & kParseStopWhenDoneFlag) && state == IterativeParsingFinishState)
+                break;
+
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        }
+
+        // Handle the end of file.
+        if (state != IterativeParsingFinishState)
+            HandleError(state, is);
+
+        return parseResult_;
+    }
+
+    static const size_t kDefaultStackCapacity = 256;    //!< Default stack capacity in bytes for storing a single decoded string.
+    internal::Stack<StackAllocator> stack_;  //!< A stack for storing decoded string temporarily during non-destructive parsing.
+    ParseResult parseResult_;
+    IterativeParsingState state_;
+}; // class GenericReader
+
+//! Reader with UTF8 encoding and default allocator.
+typedef GenericReader<UTF8<>, UTF8<> > Reader;
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__) || defined(_MSC_VER)
+RAPIDJSON_DIAG_POP
+#endif
+
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_READER_H_
diff --git a/include/rapidjson/schema.h b/include/rapidjson/schema.h
new file mode 100644
index 0000000000..f049285f4e
--- /dev/null
+++ b/include/rapidjson/schema.h
@@ -0,0 +1,3261 @@
+// Tencent is pleased to support the open source community by making RapidJSON available->
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip-> All rights reserved->
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License-> You may obtain a copy of the License at
+//
+// http://opensource->org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied-> See the License for the
+// specific language governing permissions and limitations under the License->
+
+#ifndef RAPIDJSON_SCHEMA_H_
+#define RAPIDJSON_SCHEMA_H_
+
+#include "document.h"
+#include "pointer.h"
+#include "stringbuffer.h"
+#include "error/en.h"
+#include "uri.h"
+#include <cmath> // abs, floor
+
+#if !defined(RAPIDJSON_SCHEMA_USE_INTERNALREGEX)
+#define RAPIDJSON_SCHEMA_USE_INTERNALREGEX 1
+#endif
+
+#if !defined(RAPIDJSON_SCHEMA_USE_STDREGEX) || !(__cplusplus >=201103L || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#define RAPIDJSON_SCHEMA_USE_STDREGEX 0
+#endif
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX
+#include "internal/regex.h"
+#elif RAPIDJSON_SCHEMA_USE_STDREGEX
+#include <regex>
+#endif
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX || RAPIDJSON_SCHEMA_USE_STDREGEX
+#define RAPIDJSON_SCHEMA_HAS_REGEX 1
+#else
+#define RAPIDJSON_SCHEMA_HAS_REGEX 0
+#endif
+
+#ifndef RAPIDJSON_SCHEMA_VERBOSE
+#define RAPIDJSON_SCHEMA_VERBOSE 0
+#endif
+
+RAPIDJSON_DIAG_PUSH
+
+#if defined(__GNUC__)
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_OFF(weak-vtables)
+RAPIDJSON_DIAG_OFF(exit-time-destructors)
+RAPIDJSON_DIAG_OFF(c++98-compat-pedantic)
+RAPIDJSON_DIAG_OFF(variadic-macros)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// Verbose Utilities
+
+#if RAPIDJSON_SCHEMA_VERBOSE
+
+namespace internal {
+
+inline void PrintInvalidKeywordData(const char* keyword) {
+    printf("    Fail keyword: '%s'\n", keyword);
+}
+
+inline void PrintInvalidKeywordData(const wchar_t* keyword) {
+    wprintf(L"    Fail keyword: '%ls'\n", keyword);
+}
+
+inline void PrintInvalidDocumentData(const char* document) {
+    printf("    Fail document: '%s'\n", document);
+}
+
+inline void PrintInvalidDocumentData(const wchar_t* document) {
+    wprintf(L"    Fail document: '%ls'\n", document);
+}
+
+inline void PrintValidatorPointersData(const char* s, const char* d, unsigned depth) {
+    printf("    Sch: %*s'%s'\n    Doc: %*s'%s'\n", depth * 4, " ", s, depth * 4, " ", d);
+}
+
+inline void PrintValidatorPointersData(const wchar_t* s, const wchar_t* d, unsigned depth) {
+    wprintf(L"    Sch: %*ls'%ls'\n    Doc: %*ls'%ls'\n", depth * 4, L" ", s, depth * 4, L" ", d);
+}
+
+inline void PrintSchemaIdsData(const char* base, const char* local, const char* resolved) {
+    printf("    Resolving id: Base: '%s', Local: '%s', Resolved: '%s'\n", base, local, resolved);
+}
+
+inline void PrintSchemaIdsData(const wchar_t* base, const wchar_t* local, const wchar_t* resolved) {
+    wprintf(L"    Resolving id: Base: '%ls', Local: '%ls', Resolved: '%ls'\n", base, local, resolved);
+}
+
+inline void PrintMethodData(const char* method) {
+    printf("%s\n", method);
+}
+
+inline void PrintMethodData(const char* method, bool b) {
+    printf("%s, Data: '%s'\n", method, b ? "true" : "false");
+}
+
+inline void PrintMethodData(const char* method, int64_t i) {
+    printf("%s, Data: '%" PRId64 "'\n", method, i);
+}
+
+inline void PrintMethodData(const char* method, uint64_t u) {
+    printf("%s, Data: '%" PRIu64 "'\n", method, u);
+}
+
+inline void PrintMethodData(const char* method, double d) {
+    printf("%s, Data: '%lf'\n", method, d);
+}
+
+inline void PrintMethodData(const char* method, const char* s) {
+    printf("%s, Data: '%s'\n", method, s);
+}
+
+inline void PrintMethodData(const char* method, const wchar_t* s) {
+    wprintf(L"%hs, Data: '%ls'\n", method, s);
+}
+
+inline void PrintMethodData(const char* method, const char* s1, const char* s2) {
+    printf("%s, Data: '%s', '%s'\n", method, s1, s2);
+}
+
+inline void PrintMethodData(const char* method, const wchar_t* s1, const wchar_t* s2) {
+    wprintf(L"%hs, Data: '%ls', '%ls'\n", method, s1, s2);
+}
+
+} // namespace internal
+
+#endif // RAPIDJSON_SCHEMA_VERBOSE
+
+#ifndef RAPIDJSON_SCHEMA_PRINT
+#if RAPIDJSON_SCHEMA_VERBOSE
+#define RAPIDJSON_SCHEMA_PRINT(name, ...) internal::Print##name##Data(__VA_ARGS__)
+#else
+#define RAPIDJSON_SCHEMA_PRINT(name, ...)
+#endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_INVALID_KEYWORD_RETURN
+
+#define RAPIDJSON_INVALID_KEYWORD_RETURN(code)\
+RAPIDJSON_MULTILINEMACRO_BEGIN\
+    context.invalidCode = code;\
+    context.invalidKeyword = SchemaType::GetValidateErrorKeyword(code).GetString();\
+    RAPIDJSON_SCHEMA_PRINT(InvalidKeyword, context.invalidKeyword);\
+    return false;\
+RAPIDJSON_MULTILINEMACRO_END
+
+///////////////////////////////////////////////////////////////////////////////
+// ValidateFlag
+
+/*! \def RAPIDJSON_VALIDATE_DEFAULT_FLAGS
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-defined kValidateDefaultFlags definition.
+
+    User can define this as any \c ValidateFlag combinations.
+*/
+#ifndef RAPIDJSON_VALIDATE_DEFAULT_FLAGS
+#define RAPIDJSON_VALIDATE_DEFAULT_FLAGS kValidateNoFlags
+#endif
+
+//! Combination of validate flags
+enum ValidateFlag {
+    kValidateNoFlags = 0,                                       //!< No flags are set.
+    kValidateContinueOnErrorFlag = 1,                           //!< Don't stop after first validation error.
+    kValidateReadFlag = 2,                                      //!< Validation is for a read semantic.
+    kValidateWriteFlag = 4,                                     //!< Validation is for a write semantic.
+    kValidateDefaultFlags = RAPIDJSON_VALIDATE_DEFAULT_FLAGS    //!< Default validate flags. Can be customized by defining RAPIDJSON_VALIDATE_DEFAULT_FLAGS
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Specification
+enum SchemaDraft {
+    kDraftUnknown = -1,
+    kDraftNone = 0,
+    kDraft03 = 3,
+    kDraftMin = 4,                       //!< Current minimum supported draft
+    kDraft04 = 4,
+    kDraft05 = 5,
+    kDraftMax = 5,                       //!< Current maximum supported draft
+    kDraft06 = 6,
+    kDraft07 = 7,
+    kDraft2019_09 = 8,
+    kDraft2020_12 = 9
+};
+
+enum OpenApiVersion {
+    kVersionUnknown = -1,
+    kVersionNone = 0,
+    kVersionMin = 2,                      //!< Current minimum supported version
+    kVersion20 = 2,
+    kVersion30 = 3,
+    kVersionMax = 3,                      //!< Current maximum supported version
+    kVersion31 = 4,
+};
+
+struct Specification {
+    Specification(SchemaDraft d) : draft(d), oapi(kVersionNone) {}
+    Specification(OpenApiVersion o) : oapi(o) {
+        if (oapi == kVersion20) draft = kDraft04;
+        else if (oapi == kVersion30) draft = kDraft05;
+        else if (oapi == kVersion31) draft = kDraft2020_12;
+        else draft = kDraft04;
+    }
+    ~Specification() {}
+    bool IsSupported() const {
+        return ((draft >= kDraftMin && draft <= kDraftMax) && ((oapi == kVersionNone) || (oapi >= kVersionMin && oapi <= kVersionMax)));
+    }
+    SchemaDraft draft;
+    OpenApiVersion oapi;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Forward declarations
+
+template <typename ValueType, typename Allocator>
+class GenericSchemaDocument;
+
+namespace internal {
+
+template <typename SchemaDocumentType>
+class Schema;
+
+///////////////////////////////////////////////////////////////////////////////
+// ISchemaValidator
+
+class ISchemaValidator {
+public:
+    virtual ~ISchemaValidator() {}
+    virtual bool IsValid() const = 0;
+    virtual void SetValidateFlags(unsigned flags) = 0;
+    virtual unsigned GetValidateFlags() const = 0;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// ISchemaStateFactory
+
+template <typename SchemaType>
+class ISchemaStateFactory {
+public:
+    virtual ~ISchemaStateFactory() {}
+    virtual ISchemaValidator* CreateSchemaValidator(const SchemaType&, const bool inheritContinueOnErrors) = 0;
+    virtual void DestroySchemaValidator(ISchemaValidator* validator) = 0;
+    virtual void* CreateHasher() = 0;
+    virtual uint64_t GetHashCode(void* hasher) = 0;
+    virtual void DestroryHasher(void* hasher) = 0;
+    virtual void* MallocState(size_t size) = 0;
+    virtual void FreeState(void* p) = 0;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// IValidationErrorHandler
+
+template <typename SchemaType>
+class IValidationErrorHandler {
+public:
+    typedef typename SchemaType::Ch Ch;
+    typedef typename SchemaType::SValue SValue;
+
+    virtual ~IValidationErrorHandler() {}
+
+    virtual void NotMultipleOf(int64_t actual, const SValue& expected) = 0;
+    virtual void NotMultipleOf(uint64_t actual, const SValue& expected) = 0;
+    virtual void NotMultipleOf(double actual, const SValue& expected) = 0;
+    virtual void AboveMaximum(int64_t actual, const SValue& expected, bool exclusive) = 0;
+    virtual void AboveMaximum(uint64_t actual, const SValue& expected, bool exclusive) = 0;
+    virtual void AboveMaximum(double actual, const SValue& expected, bool exclusive) = 0;
+    virtual void BelowMinimum(int64_t actual, const SValue& expected, bool exclusive) = 0;
+    virtual void BelowMinimum(uint64_t actual, const SValue& expected, bool exclusive) = 0;
+    virtual void BelowMinimum(double actual, const SValue& expected, bool exclusive) = 0;
+
+    virtual void TooLong(const Ch* str, SizeType length, SizeType expected) = 0;
+    virtual void TooShort(const Ch* str, SizeType length, SizeType expected) = 0;
+    virtual void DoesNotMatch(const Ch* str, SizeType length) = 0;
+
+    virtual void DisallowedItem(SizeType index) = 0;
+    virtual void TooFewItems(SizeType actualCount, SizeType expectedCount) = 0;
+    virtual void TooManyItems(SizeType actualCount, SizeType expectedCount) = 0;
+    virtual void DuplicateItems(SizeType index1, SizeType index2) = 0;
+
+    virtual void TooManyProperties(SizeType actualCount, SizeType expectedCount) = 0;
+    virtual void TooFewProperties(SizeType actualCount, SizeType expectedCount) = 0;
+    virtual void StartMissingProperties() = 0;
+    virtual void AddMissingProperty(const SValue& name) = 0;
+    virtual bool EndMissingProperties() = 0;
+    virtual void PropertyViolations(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void DisallowedProperty(const Ch* name, SizeType length) = 0;
+
+    virtual void StartDependencyErrors() = 0;
+    virtual void StartMissingDependentProperties() = 0;
+    virtual void AddMissingDependentProperty(const SValue& targetName) = 0;
+    virtual void EndMissingDependentProperties(const SValue& sourceName) = 0;
+    virtual void AddDependencySchemaError(const SValue& souceName, ISchemaValidator* subvalidator) = 0;
+    virtual bool EndDependencyErrors() = 0;
+
+    virtual void DisallowedValue(const ValidateErrorCode code) = 0;
+    virtual void StartDisallowedType() = 0;
+    virtual void AddExpectedType(const typename SchemaType::ValueType& expectedType) = 0;
+    virtual void EndDisallowedType(const typename SchemaType::ValueType& actualType) = 0;
+    virtual void NotAllOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void NoneOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void NotOneOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void MultipleOneOf(SizeType index1, SizeType index2) = 0;
+    virtual void Disallowed() = 0;
+    virtual void DisallowedWhenWriting() = 0;
+    virtual void DisallowedWhenReading() = 0;
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Hasher
+
+// For comparison of compound value
+template<typename Encoding, typename Allocator>
+class Hasher {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    Hasher(Allocator* allocator = 0, size_t stackCapacity = kDefaultSize) : stack_(allocator, stackCapacity) {}
+
+    bool Null() { return WriteType(kNullType); }
+    bool Bool(bool b) { return WriteType(b ? kTrueType : kFalseType); }
+    bool Int(int i) { Number n; n.u.i = i; n.d = static_cast<double>(i); return WriteNumber(n); }
+    bool Uint(unsigned u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
+    bool Int64(int64_t i) { Number n; n.u.i = i; n.d = static_cast<double>(i); return WriteNumber(n); }
+    bool Uint64(uint64_t u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
+    bool Double(double d) {
+        Number n;
+        if (d < 0) n.u.i = static_cast<int64_t>(d);
+        else       n.u.u = static_cast<uint64_t>(d);
+        n.d = d;
+        return WriteNumber(n);
+    }
+
+    bool RawNumber(const Ch* str, SizeType len, bool) {
+        WriteBuffer(kNumberType, str, len * sizeof(Ch));
+        return true;
+    }
+
+    bool String(const Ch* str, SizeType len, bool) {
+        WriteBuffer(kStringType, str, len * sizeof(Ch));
+        return true;
+    }
+
+    bool StartObject() { return true; }
+    bool Key(const Ch* str, SizeType len, bool copy) { return String(str, len, copy); }
+    bool EndObject(SizeType memberCount) { 
+        uint64_t h = Hash(0, kObjectType);
+        uint64_t* kv = stack_.template Pop<uint64_t>(memberCount * 2);
+        for (SizeType i = 0; i < memberCount; i++)
+            // Issue #2205
+            // Hasing the key to avoid key=value cases with bug-prone zero-value hash
+            h ^= Hash(Hash(0, kv[i * 2]), kv[i * 2 + 1]);  // Use xor to achieve member order insensitive
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+    
+    bool StartArray() { return true; }
+    bool EndArray(SizeType elementCount) { 
+        uint64_t h = Hash(0, kArrayType);
+        uint64_t* e = stack_.template Pop<uint64_t>(elementCount);
+        for (SizeType i = 0; i < elementCount; i++)
+            h = Hash(h, e[i]); // Use hash to achieve element order sensitive
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+
+    bool IsValid() const { return stack_.GetSize() == sizeof(uint64_t); }
+
+    uint64_t GetHashCode() const {
+        RAPIDJSON_ASSERT(IsValid());
+        return *stack_.template Top<uint64_t>();
+    }
+
+private:
+    static const size_t kDefaultSize = 256;
+    struct Number {
+        union U {
+            uint64_t u;
+            int64_t i;
+        }u;
+        double d;
+    };
+
+    bool WriteType(Type type) { return WriteBuffer(type, 0, 0); }
+    
+    bool WriteNumber(const Number& n) { return WriteBuffer(kNumberType, &n, sizeof(n)); }
+    
+    bool WriteBuffer(Type type, const void* data, size_t len) {
+        // FNV-1a from http://isthe.com/chongo/tech/comp/fnv/
+        uint64_t h = Hash(RAPIDJSON_UINT64_C2(0xcbf29ce4, 0x84222325), type);
+        const unsigned char* d = static_cast<const unsigned char*>(data);
+        for (size_t i = 0; i < len; i++)
+            h = Hash(h, d[i]);
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+
+    static uint64_t Hash(uint64_t h, uint64_t d) {
+        static const uint64_t kPrime = RAPIDJSON_UINT64_C2(0x00000100, 0x000001b3);
+        h ^= d;
+        h *= kPrime;
+        return h;
+    }
+
+    Stack<Allocator> stack_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// SchemaValidationContext
+
+template <typename SchemaDocumentType>
+struct SchemaValidationContext {
+    typedef Schema<SchemaDocumentType> SchemaType;
+    typedef ISchemaStateFactory<SchemaType> SchemaValidatorFactoryType;
+    typedef IValidationErrorHandler<SchemaType> ErrorHandlerType;
+    typedef typename SchemaType::ValueType ValueType;
+    typedef typename ValueType::Ch Ch;
+
+    enum PatternValidatorType {
+        kPatternValidatorOnly,
+        kPatternValidatorWithProperty,
+        kPatternValidatorWithAdditionalProperty
+    };
+
+    SchemaValidationContext(SchemaValidatorFactoryType& f, ErrorHandlerType& eh, const SchemaType* s, unsigned fl = 0) :
+        factory(f),
+        error_handler(eh),
+        schema(s),
+        flags(fl),
+        valueSchema(),
+        invalidKeyword(),
+        invalidCode(),
+        hasher(),
+        arrayElementHashCodes(),
+        validators(),
+        validatorCount(),
+        patternPropertiesValidators(),
+        patternPropertiesValidatorCount(),
+        patternPropertiesSchemas(),
+        patternPropertiesSchemaCount(),
+        valuePatternValidatorType(kPatternValidatorOnly),
+        propertyExist(),
+        inArray(false),
+        valueUniqueness(false),
+        arrayUniqueness(false)
+    {
+    }
+
+    ~SchemaValidationContext() {
+        if (hasher)
+            factory.DestroryHasher(hasher);
+        if (validators) {
+            for (SizeType i = 0; i < validatorCount; i++) {
+                if (validators[i]) {
+                    factory.DestroySchemaValidator(validators[i]);
+                }
+            }
+            factory.FreeState(validators);
+        }
+        if (patternPropertiesValidators) {
+            for (SizeType i = 0; i < patternPropertiesValidatorCount; i++) {
+                if (patternPropertiesValidators[i]) {
+                    factory.DestroySchemaValidator(patternPropertiesValidators[i]);
+                }
+            }
+            factory.FreeState(patternPropertiesValidators);
+        }
+        if (patternPropertiesSchemas)
+            factory.FreeState(patternPropertiesSchemas);
+        if (propertyExist)
+            factory.FreeState(propertyExist);
+    }
+
+    SchemaValidatorFactoryType& factory;
+    ErrorHandlerType& error_handler;
+    const SchemaType* schema;
+    unsigned flags;
+    const SchemaType* valueSchema;
+    const Ch* invalidKeyword;
+    ValidateErrorCode invalidCode;
+    void* hasher; // Only validator access
+    void* arrayElementHashCodes; // Only validator access this
+    ISchemaValidator** validators;
+    SizeType validatorCount;
+    ISchemaValidator** patternPropertiesValidators;
+    SizeType patternPropertiesValidatorCount;
+    const SchemaType** patternPropertiesSchemas;
+    SizeType patternPropertiesSchemaCount;
+    PatternValidatorType valuePatternValidatorType;
+    PatternValidatorType objectPatternValidatorType;
+    SizeType arrayElementIndex;
+    bool* propertyExist;
+    bool inArray;
+    bool valueUniqueness;
+    bool arrayUniqueness;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Schema
+
+template <typename SchemaDocumentType>
+class Schema {
+public:
+    typedef typename SchemaDocumentType::ValueType ValueType;
+    typedef typename SchemaDocumentType::AllocatorType AllocatorType;
+    typedef typename SchemaDocumentType::PointerType PointerType;
+    typedef typename ValueType::EncodingType EncodingType;
+    typedef typename EncodingType::Ch Ch;
+    typedef SchemaValidationContext<SchemaDocumentType> Context;
+    typedef Schema<SchemaDocumentType> SchemaType;
+    typedef GenericValue<EncodingType, AllocatorType> SValue;
+    typedef IValidationErrorHandler<Schema> ErrorHandler;
+    typedef GenericUri<ValueType, AllocatorType> UriType;
+    friend class GenericSchemaDocument<ValueType, AllocatorType>;
+
+    Schema(SchemaDocumentType* schemaDocument, const PointerType& p, const ValueType& value, const ValueType& document, AllocatorType* allocator, const UriType& id = UriType()) :
+        allocator_(allocator),
+        uri_(schemaDocument->GetURI(), *allocator),
+        id_(id, allocator),
+        spec_(schemaDocument->GetSpecification()),
+        pointer_(p, allocator),
+        typeless_(schemaDocument->GetTypeless()),
+        enum_(),
+        enumCount_(),
+        not_(),
+        type_((1 << kTotalSchemaType) - 1), // typeless
+        validatorCount_(),
+        notValidatorIndex_(),
+        properties_(),
+        additionalPropertiesSchema_(),
+        patternProperties_(),
+        patternPropertyCount_(),
+        propertyCount_(),
+        minProperties_(),
+        maxProperties_(SizeType(~0)),
+        additionalProperties_(true),
+        hasDependencies_(),
+        hasRequired_(),
+        hasSchemaDependencies_(),
+        additionalItemsSchema_(),
+        itemsList_(),
+        itemsTuple_(),
+        itemsTupleCount_(),
+        minItems_(),
+        maxItems_(SizeType(~0)),
+        additionalItems_(true),
+        uniqueItems_(false),
+        pattern_(),
+        minLength_(0),
+        maxLength_(~SizeType(0)),
+        exclusiveMinimum_(false),
+        exclusiveMaximum_(false),
+        defaultValueLength_(0),
+        readOnly_(false),
+        writeOnly_(false),
+        nullable_(false)
+    {
+        GenericStringBuffer<EncodingType> sb;
+        p.StringifyUriFragment(sb);
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Schema", sb.GetString(), id.GetString());
+
+        typedef typename ValueType::ConstValueIterator ConstValueIterator;
+        typedef typename ValueType::ConstMemberIterator ConstMemberIterator;
+
+        // PR #1393
+        // Early add this Schema and its $ref(s) in schemaDocument's map to avoid infinite
+        // recursion (with recursive schemas), since schemaDocument->getSchema() is always
+        // checked before creating a new one. Don't cache typeless_, though.
+        if (this != typeless_) {
+          typedef typename SchemaDocumentType::SchemaEntry SchemaEntry;
+          SchemaEntry *entry = schemaDocument->schemaMap_.template Push<SchemaEntry>();
+          new (entry) SchemaEntry(pointer_, this, true, allocator_);
+          schemaDocument->AddSchemaRefs(this);
+        }
+
+        if (!value.IsObject())
+            return;
+
+        // If we have an id property, resolve it with the in-scope id
+        // Not supported for open api 2.0 or 3.0
+        if (spec_.oapi != kVersion20 && spec_.oapi != kVersion30)
+        if (const ValueType* v = GetMember(value, GetIdString())) {
+            if (v->IsString()) {
+                UriType local(*v, allocator);
+                id_ = local.Resolve(id_, allocator);
+                    RAPIDJSON_SCHEMA_PRINT(SchemaIds, id.GetString(), v->GetString(), id_.GetString());
+            }
+        }
+
+        if (const ValueType* v = GetMember(value, GetTypeString())) {
+            type_ = 0;
+            if (v->IsString())
+                AddType(*v);
+            else if (v->IsArray())
+                for (ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr)
+                    AddType(*itr);
+        }
+
+        if (const ValueType* v = GetMember(value, GetEnumString())) {
+            if (v->IsArray() && v->Size() > 0) {
+                enum_ = static_cast<uint64_t*>(allocator_->Malloc(sizeof(uint64_t) * v->Size()));
+                for (ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr) {
+                    typedef Hasher<EncodingType, MemoryPoolAllocator<AllocatorType> > EnumHasherType;
+                    char buffer[256u + 24];
+                    MemoryPoolAllocator<AllocatorType> hasherAllocator(buffer, sizeof(buffer));
+                    EnumHasherType h(&hasherAllocator, 256);
+                    itr->Accept(h);
+                    enum_[enumCount_++] = h.GetHashCode();
+                }
+            }
+        }
+
+        if (schemaDocument)
+            AssignIfExist(allOf_, *schemaDocument, p, value, GetAllOfString(), document);
+
+        // AnyOf, OneOf, Not not supported for open api 2.0
+        if (schemaDocument && spec_.oapi != kVersion20) {
+            AssignIfExist(anyOf_, *schemaDocument, p, value, GetAnyOfString(), document);
+            AssignIfExist(oneOf_, *schemaDocument, p, value, GetOneOfString(), document);
+
+            if (const ValueType* v = GetMember(value, GetNotString())) {
+                schemaDocument->CreateSchema(&not_, p.Append(GetNotString(), allocator_), *v, document, id_);
+                notValidatorIndex_ = validatorCount_;
+                validatorCount_++;
+            }
+        }
+
+        // Object
+
+        const ValueType* properties = GetMember(value, GetPropertiesString());
+        const ValueType* required = GetMember(value, GetRequiredString());
+        const ValueType* dependencies = GetMember(value, GetDependenciesString());
+        {
+            // Gather properties from properties/required/dependencies
+            SValue allProperties(kArrayType);
+
+            if (properties && properties->IsObject())
+                for (ConstMemberIterator itr = properties->MemberBegin(); itr != properties->MemberEnd(); ++itr)
+                    AddUniqueElement(allProperties, itr->name);
+
+            if (required && required->IsArray())
+                for (ConstValueIterator itr = required->Begin(); itr != required->End(); ++itr)
+                    if (itr->IsString())
+                        AddUniqueElement(allProperties, *itr);
+
+            // Dependencies not supported for open api 2.0 and 3.0
+            if (spec_.oapi != kVersion20 && spec_.oapi != kVersion30)
+            if (dependencies && dependencies->IsObject())
+                for (ConstMemberIterator itr = dependencies->MemberBegin(); itr != dependencies->MemberEnd(); ++itr) {
+                    AddUniqueElement(allProperties, itr->name);
+                    if (itr->value.IsArray())
+                        for (ConstValueIterator i = itr->value.Begin(); i != itr->value.End(); ++i)
+                            if (i->IsString())
+                                AddUniqueElement(allProperties, *i);
+                }
+
+            if (allProperties.Size() > 0) {
+                propertyCount_ = allProperties.Size();
+                properties_ = static_cast<Property*>(allocator_->Malloc(sizeof(Property) * propertyCount_));
+                for (SizeType i = 0; i < propertyCount_; i++) {
+                    new (&properties_[i]) Property();
+                    properties_[i].name = allProperties[i];
+                    properties_[i].schema = typeless_;
+                }
+            }
+        }
+
+        if (properties && properties->IsObject()) {
+            PointerType q = p.Append(GetPropertiesString(), allocator_);
+            for (ConstMemberIterator itr = properties->MemberBegin(); itr != properties->MemberEnd(); ++itr) {
+                SizeType index;
+                if (FindPropertyIndex(itr->name, &index))
+                    schemaDocument->CreateSchema(&properties_[index].schema, q.Append(itr->name, allocator_), itr->value, document, id_);
+            }
+        }
+
+        // PatternProperties not supported for open api 2.0 and 3.0
+        if (spec_.oapi != kVersion20 && spec_.oapi != kVersion30)
+        if (const ValueType* v = GetMember(value, GetPatternPropertiesString())) {
+            PointerType q = p.Append(GetPatternPropertiesString(), allocator_);
+            patternProperties_ = static_cast<PatternProperty*>(allocator_->Malloc(sizeof(PatternProperty) * v->MemberCount()));
+            patternPropertyCount_ = 0;
+
+            for (ConstMemberIterator itr = v->MemberBegin(); itr != v->MemberEnd(); ++itr) {
+                new (&patternProperties_[patternPropertyCount_]) PatternProperty();
+                PointerType r = q.Append(itr->name, allocator_);
+                patternProperties_[patternPropertyCount_].pattern = CreatePattern(itr->name, schemaDocument, r);
+                schemaDocument->CreateSchema(&patternProperties_[patternPropertyCount_].schema, r, itr->value, document, id_);
+                patternPropertyCount_++;
+            }
+        }
+
+        if (required && required->IsArray())
+            for (ConstValueIterator itr = required->Begin(); itr != required->End(); ++itr)
+                if (itr->IsString()) {
+                    SizeType index;
+                    if (FindPropertyIndex(*itr, &index)) {
+                        properties_[index].required = true;
+                        hasRequired_ = true;
+                    }
+                }
+
+        // Dependencies not supported for open api 2.0 and 3.0
+        if (spec_.oapi != kVersion20 && spec_.oapi != kVersion30)
+        if (dependencies && dependencies->IsObject()) {
+            PointerType q = p.Append(GetDependenciesString(), allocator_);
+            hasDependencies_ = true;
+            for (ConstMemberIterator itr = dependencies->MemberBegin(); itr != dependencies->MemberEnd(); ++itr) {
+                SizeType sourceIndex;
+                if (FindPropertyIndex(itr->name, &sourceIndex)) {
+                    if (itr->value.IsArray()) {
+                        properties_[sourceIndex].dependencies = static_cast<bool*>(allocator_->Malloc(sizeof(bool) * propertyCount_));
+                        std::memset(properties_[sourceIndex].dependencies, 0, sizeof(bool)* propertyCount_);
+                        for (ConstValueIterator targetItr = itr->value.Begin(); targetItr != itr->value.End(); ++targetItr) {
+                            SizeType targetIndex;
+                            if (FindPropertyIndex(*targetItr, &targetIndex))
+                                properties_[sourceIndex].dependencies[targetIndex] = true;
+                        }
+                    }
+                    else if (itr->value.IsObject()) {
+                        hasSchemaDependencies_ = true;
+                        schemaDocument->CreateSchema(&properties_[sourceIndex].dependenciesSchema, q.Append(itr->name, allocator_), itr->value, document, id_);
+                        properties_[sourceIndex].dependenciesValidatorIndex = validatorCount_;
+                        validatorCount_++;
+                    }
+                }
+            }
+        }
+
+        if (const ValueType* v = GetMember(value, GetAdditionalPropertiesString())) {
+            if (v->IsBool())
+                additionalProperties_ = v->GetBool();
+            else if (v->IsObject())
+                schemaDocument->CreateSchema(&additionalPropertiesSchema_, p.Append(GetAdditionalPropertiesString(), allocator_), *v, document, id_);
+        }
+
+        AssignIfExist(minProperties_, value, GetMinPropertiesString());
+        AssignIfExist(maxProperties_, value, GetMaxPropertiesString());
+
+        // Array
+        if (const ValueType* v = GetMember(value, GetItemsString())) {
+            PointerType q = p.Append(GetItemsString(), allocator_);
+            if (v->IsObject()) // List validation
+                schemaDocument->CreateSchema(&itemsList_, q, *v, document, id_);
+            else if (v->IsArray()) { // Tuple validation
+                itemsTuple_ = static_cast<const Schema**>(allocator_->Malloc(sizeof(const Schema*) * v->Size()));
+                SizeType index = 0;
+                for (ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr, index++)
+                    schemaDocument->CreateSchema(&itemsTuple_[itemsTupleCount_++], q.Append(index, allocator_), *itr, document, id_);
+            }
+        }
+
+        AssignIfExist(minItems_, value, GetMinItemsString());
+        AssignIfExist(maxItems_, value, GetMaxItemsString());
+
+        // AdditionalItems not supported for openapi 2.0 and 3.0
+        if (spec_.oapi != kVersion20 && spec_.oapi != kVersion30)
+        if (const ValueType* v = GetMember(value, GetAdditionalItemsString())) {
+            if (v->IsBool())
+                additionalItems_ = v->GetBool();
+            else if (v->IsObject())
+                schemaDocument->CreateSchema(&additionalItemsSchema_, p.Append(GetAdditionalItemsString(), allocator_), *v, document, id_);
+        }
+
+        AssignIfExist(uniqueItems_, value, GetUniqueItemsString());
+
+        // String
+        AssignIfExist(minLength_, value, GetMinLengthString());
+        AssignIfExist(maxLength_, value, GetMaxLengthString());
+
+        if (const ValueType* v = GetMember(value, GetPatternString()))
+            pattern_ = CreatePattern(*v, schemaDocument, p.Append(GetPatternString(), allocator_));
+
+        // Number
+        if (const ValueType* v = GetMember(value, GetMinimumString()))
+            if (v->IsNumber())
+                minimum_.CopyFrom(*v, *allocator_);
+
+        if (const ValueType* v = GetMember(value, GetMaximumString()))
+            if (v->IsNumber())
+                maximum_.CopyFrom(*v, *allocator_);
+
+        AssignIfExist(exclusiveMinimum_, value, GetExclusiveMinimumString());
+        AssignIfExist(exclusiveMaximum_, value, GetExclusiveMaximumString());
+
+        if (const ValueType* v = GetMember(value, GetMultipleOfString()))
+            if (v->IsNumber() && v->GetDouble() > 0.0)
+                multipleOf_.CopyFrom(*v, *allocator_);
+
+        // Default
+        if (const ValueType* v = GetMember(value, GetDefaultValueString()))
+            if (v->IsString())
+                defaultValueLength_ = v->GetStringLength();
+
+        // ReadOnly - open api only (until draft 7 supported)
+        // WriteOnly - open api 3 only (until draft 7 supported)
+        // Both can't be true
+        if (spec_.oapi != kVersionNone)
+            AssignIfExist(readOnly_, value, GetReadOnlyString());
+        if (spec_.oapi >= kVersion30)
+            AssignIfExist(writeOnly_, value, GetWriteOnlyString());
+        if (readOnly_ && writeOnly_)
+            schemaDocument->SchemaError(kSchemaErrorReadOnlyAndWriteOnly, p);
+
+        // Nullable - open api 3 only
+        // If true add 'null' as allowable type
+        if (spec_.oapi >= kVersion30) {
+            AssignIfExist(nullable_, value, GetNullableString());
+            if (nullable_)
+                AddType(GetNullString());
+        }
+    }
+
+    ~Schema() {
+        AllocatorType::Free(enum_);
+        if (properties_) {
+            for (SizeType i = 0; i < propertyCount_; i++)
+                properties_[i].~Property();
+            AllocatorType::Free(properties_);
+        }
+        if (patternProperties_) {
+            for (SizeType i = 0; i < patternPropertyCount_; i++)
+                patternProperties_[i].~PatternProperty();
+            AllocatorType::Free(patternProperties_);
+        }
+        AllocatorType::Free(itemsTuple_);
+#if RAPIDJSON_SCHEMA_HAS_REGEX
+        if (pattern_) {
+            pattern_->~RegexType();
+            AllocatorType::Free(pattern_);
+        }
+#endif
+    }
+
+    const SValue& GetURI() const {
+        return uri_;
+    }
+
+    const UriType& GetId() const {
+        return id_;
+    }
+
+    const Specification& GetSpecification() const {
+        return spec_;
+    }
+
+    const PointerType& GetPointer() const {
+        return pointer_;
+    }
+
+    bool BeginValue(Context& context) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::BeginValue");
+        if (context.inArray) {
+            if (uniqueItems_)
+                context.valueUniqueness = true;
+
+            if (itemsList_)
+                context.valueSchema = itemsList_;
+            else if (itemsTuple_) {
+                if (context.arrayElementIndex < itemsTupleCount_)
+                    context.valueSchema = itemsTuple_[context.arrayElementIndex];
+                else if (additionalItemsSchema_)
+                    context.valueSchema = additionalItemsSchema_;
+                else if (additionalItems_)
+                    context.valueSchema = typeless_;
+                else {
+                    context.error_handler.DisallowedItem(context.arrayElementIndex);
+                    // Must set valueSchema for when kValidateContinueOnErrorFlag is set, else reports spurious type error
+                    context.valueSchema = typeless_;
+                    // Must bump arrayElementIndex for when kValidateContinueOnErrorFlag is set
+                    context.arrayElementIndex++;
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorAdditionalItems);
+                }
+            }
+            else
+                context.valueSchema = typeless_;
+
+            context.arrayElementIndex++;
+        }
+        return true;
+    }
+
+    RAPIDJSON_FORCEINLINE bool EndValue(Context& context) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::EndValue");
+        // Only check pattern properties if we have validators
+        if (context.patternPropertiesValidatorCount > 0) {
+            bool otherValid = false;
+            SizeType count = context.patternPropertiesValidatorCount;
+            if (context.objectPatternValidatorType != Context::kPatternValidatorOnly)
+                otherValid = context.patternPropertiesValidators[--count]->IsValid();
+
+            bool patternValid = true;
+            for (SizeType i = 0; i < count; i++)
+                if (!context.patternPropertiesValidators[i]->IsValid()) {
+                    patternValid = false;
+                    break;
+                }
+
+            if (context.objectPatternValidatorType == Context::kPatternValidatorOnly) {
+                if (!patternValid) {
+                    context.error_handler.PropertyViolations(context.patternPropertiesValidators, count);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorPatternProperties);
+                }
+            }
+            else if (context.objectPatternValidatorType == Context::kPatternValidatorWithProperty) {
+                if (!patternValid || !otherValid) {
+                    context.error_handler.PropertyViolations(context.patternPropertiesValidators, count + 1);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorPatternProperties);
+                }
+            }
+            else if (!patternValid && !otherValid) { // kPatternValidatorWithAdditionalProperty)
+                context.error_handler.PropertyViolations(context.patternPropertiesValidators, count + 1);
+                RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorPatternProperties);
+            }
+        }
+
+        // For enums only check if we have a hasher
+        if (enum_ && context.hasher) {
+            const uint64_t h = context.factory.GetHashCode(context.hasher);
+            for (SizeType i = 0; i < enumCount_; i++)
+                if (enum_[i] == h)
+                    goto foundEnum;
+            context.error_handler.DisallowedValue(kValidateErrorEnum);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorEnum);
+            foundEnum:;
+        }
+
+        // Only check allOf etc if we have validators
+        if (context.validatorCount > 0) {
+            if (allOf_.schemas)
+                for (SizeType i = allOf_.begin; i < allOf_.begin + allOf_.count; i++)
+                    if (!context.validators[i]->IsValid()) {
+                        context.error_handler.NotAllOf(&context.validators[allOf_.begin], allOf_.count);
+                        RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorAllOf);
+                    }
+
+            if (anyOf_.schemas) {
+                for (SizeType i = anyOf_.begin; i < anyOf_.begin + anyOf_.count; i++)
+                    if (context.validators[i]->IsValid())
+                        goto foundAny;
+                context.error_handler.NoneOf(&context.validators[anyOf_.begin], anyOf_.count);
+                RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorAnyOf);
+                foundAny:;
+            }
+
+            if (oneOf_.schemas) {
+                bool oneValid = false;
+                SizeType firstMatch = 0;
+                for (SizeType i = oneOf_.begin; i < oneOf_.begin + oneOf_.count; i++)
+                    if (context.validators[i]->IsValid()) {
+                        if (oneValid) {
+                            context.error_handler.MultipleOneOf(firstMatch, i - oneOf_.begin);
+                            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorOneOfMatch);
+                        } else {
+                            oneValid = true;
+                            firstMatch = i - oneOf_.begin;
+                        }
+                    }
+                if (!oneValid) {
+                    context.error_handler.NotOneOf(&context.validators[oneOf_.begin], oneOf_.count);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorOneOf);
+                }
+            }
+
+            if (not_ && context.validators[notValidatorIndex_]->IsValid()) {
+                context.error_handler.Disallowed();
+                RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorNot);
+            }
+        }
+
+        return true;
+    }
+
+    bool Null(Context& context) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Null");
+        if (!(type_ & (1 << kNullSchemaType))) {
+            DisallowedType(context, GetNullString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+        return CreateParallelValidator(context);
+    }
+
+    bool Bool(Context& context, bool b) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Bool", b);
+        if (!CheckBool(context, b))
+            return false;
+        return CreateParallelValidator(context);
+    }
+
+    bool Int(Context& context, int i) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Int", (int64_t)i);
+        if (!CheckInt(context, i))
+            return false;
+        return CreateParallelValidator(context);
+    }
+
+    bool Uint(Context& context, unsigned u) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Uint", (uint64_t)u);
+        if (!CheckUint(context, u))
+            return false;
+        return CreateParallelValidator(context);
+    }
+
+    bool Int64(Context& context, int64_t i) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Int64", i);
+        if (!CheckInt(context, i))
+            return false;
+        return CreateParallelValidator(context);
+    }
+
+    bool Uint64(Context& context, uint64_t u) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Uint64", u);
+        if (!CheckUint(context, u))
+            return false;
+        return CreateParallelValidator(context);
+    }
+
+    bool Double(Context& context, double d) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Double", d);
+        if (!(type_ & (1 << kNumberSchemaType))) {
+            DisallowedType(context, GetNumberString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        if (!minimum_.IsNull() && !CheckDoubleMinimum(context, d))
+            return false;
+
+        if (!maximum_.IsNull() && !CheckDoubleMaximum(context, d))
+            return false;
+
+        if (!multipleOf_.IsNull() && !CheckDoubleMultipleOf(context, d))
+            return false;
+
+        return CreateParallelValidator(context);
+    }
+
+    bool String(Context& context, const Ch* str, SizeType length, bool) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::String", str);
+        if (!(type_ & (1 << kStringSchemaType))) {
+            DisallowedType(context, GetStringString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        if (minLength_ != 0 || maxLength_ != SizeType(~0)) {
+            SizeType count;
+            if (internal::CountStringCodePoint<EncodingType>(str, length, &count)) {
+                if (count < minLength_) {
+                    context.error_handler.TooShort(str, length, minLength_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMinLength);
+                }
+                if (count > maxLength_) {
+                    context.error_handler.TooLong(str, length, maxLength_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMaxLength);
+                }
+            }
+        }
+
+        if (pattern_ && !IsPatternMatch(pattern_, str, length)) {
+            context.error_handler.DoesNotMatch(str, length);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorPattern);
+        }
+
+        return CreateParallelValidator(context);
+    }
+
+    bool StartObject(Context& context) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::StartObject");
+        if (!(type_ & (1 << kObjectSchemaType))) {
+            DisallowedType(context, GetObjectString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        if (hasDependencies_ || hasRequired_) {
+            context.propertyExist = static_cast<bool*>(context.factory.MallocState(sizeof(bool) * propertyCount_));
+            std::memset(context.propertyExist, 0, sizeof(bool) * propertyCount_);
+        }
+
+        if (patternProperties_) { // pre-allocate schema array
+            SizeType count = patternPropertyCount_ + 1; // extra for valuePatternValidatorType
+            context.patternPropertiesSchemas = static_cast<const SchemaType**>(context.factory.MallocState(sizeof(const SchemaType*) * count));
+            context.patternPropertiesSchemaCount = 0;
+            std::memset(context.patternPropertiesSchemas, 0, sizeof(SchemaType*) * count);
+        }
+
+        return CreateParallelValidator(context);
+    }
+
+    bool Key(Context& context, const Ch* str, SizeType len, bool) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::Key", str);
+
+        if (patternProperties_) {
+            context.patternPropertiesSchemaCount = 0;
+            for (SizeType i = 0; i < patternPropertyCount_; i++)
+                if (patternProperties_[i].pattern && IsPatternMatch(patternProperties_[i].pattern, str, len)) {
+                    context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = patternProperties_[i].schema;
+                    context.valueSchema = typeless_;
+                }
+        }
+
+        SizeType index  = 0;
+        if (FindPropertyIndex(ValueType(str, len).Move(), &index)) {
+            if (context.patternPropertiesSchemaCount > 0) {
+                context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = properties_[index].schema;
+                context.valueSchema = typeless_;
+                context.valuePatternValidatorType = Context::kPatternValidatorWithProperty;
+            }
+            else
+                context.valueSchema = properties_[index].schema;
+
+            if (context.propertyExist)
+                context.propertyExist[index] = true;
+
+            return true;
+        }
+
+        if (additionalPropertiesSchema_) {
+            if (context.patternPropertiesSchemaCount > 0) {
+                context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = additionalPropertiesSchema_;
+                context.valueSchema = typeless_;
+                context.valuePatternValidatorType = Context::kPatternValidatorWithAdditionalProperty;
+            }
+            else
+                context.valueSchema = additionalPropertiesSchema_;
+            return true;
+        }
+        else if (additionalProperties_) {
+            context.valueSchema = typeless_;
+            return true;
+        }
+
+        if (context.patternPropertiesSchemaCount == 0) { // patternProperties are not additional properties
+            // Must set valueSchema for when kValidateContinueOnErrorFlag is set, else reports spurious type error
+            context.valueSchema = typeless_;
+            context.error_handler.DisallowedProperty(str, len);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorAdditionalProperties);
+        }
+
+        return true;
+    }
+
+    bool EndObject(Context& context, SizeType memberCount) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::EndObject");
+        if (hasRequired_) {
+            context.error_handler.StartMissingProperties();
+            for (SizeType index = 0; index < propertyCount_; index++)
+                if (properties_[index].required && !context.propertyExist[index])
+                    if (properties_[index].schema->defaultValueLength_ == 0 )
+                        context.error_handler.AddMissingProperty(properties_[index].name);
+            if (context.error_handler.EndMissingProperties())
+                RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorRequired);
+        }
+
+        if (memberCount < minProperties_) {
+            context.error_handler.TooFewProperties(memberCount, minProperties_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMinProperties);
+        }
+
+        if (memberCount > maxProperties_) {
+            context.error_handler.TooManyProperties(memberCount, maxProperties_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMaxProperties);
+        }
+
+        if (hasDependencies_) {
+            context.error_handler.StartDependencyErrors();
+            for (SizeType sourceIndex = 0; sourceIndex < propertyCount_; sourceIndex++) {
+                const Property& source = properties_[sourceIndex];
+                if (context.propertyExist[sourceIndex]) {
+                    if (source.dependencies) {
+                        context.error_handler.StartMissingDependentProperties();
+                        for (SizeType targetIndex = 0; targetIndex < propertyCount_; targetIndex++)
+                            if (source.dependencies[targetIndex] && !context.propertyExist[targetIndex])
+                                context.error_handler.AddMissingDependentProperty(properties_[targetIndex].name);
+                        context.error_handler.EndMissingDependentProperties(source.name);
+                    }
+                    else if (source.dependenciesSchema) {
+                        ISchemaValidator* dependenciesValidator = context.validators[source.dependenciesValidatorIndex];
+                        if (!dependenciesValidator->IsValid())
+                            context.error_handler.AddDependencySchemaError(source.name, dependenciesValidator);
+                    }
+                }
+            }
+            if (context.error_handler.EndDependencyErrors())
+                RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorDependencies);
+        }
+
+        return true;
+    }
+
+    bool StartArray(Context& context) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::StartArray");
+        context.arrayElementIndex = 0;
+        context.inArray = true;  // Ensure we note that we are in an array
+
+        if (!(type_ & (1 << kArraySchemaType))) {
+            DisallowedType(context, GetArrayString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        return CreateParallelValidator(context);
+    }
+
+    bool EndArray(Context& context, SizeType elementCount) const {
+        RAPIDJSON_SCHEMA_PRINT(Method, "Schema::EndArray");
+        context.inArray = false;
+
+        if (elementCount < minItems_) {
+            context.error_handler.TooFewItems(elementCount, minItems_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMinItems);
+        }
+
+        if (elementCount > maxItems_) {
+            context.error_handler.TooManyItems(elementCount, maxItems_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMaxItems);
+        }
+
+        return true;
+    }
+
+    static const ValueType& GetValidateErrorKeyword(ValidateErrorCode validateErrorCode) {
+        switch (validateErrorCode) {
+            case kValidateErrorMultipleOf:              return GetMultipleOfString();
+            case kValidateErrorMaximum:                 return GetMaximumString();
+            case kValidateErrorExclusiveMaximum:        return GetMaximumString(); // Same
+            case kValidateErrorMinimum:                 return GetMinimumString();
+            case kValidateErrorExclusiveMinimum:        return GetMinimumString(); // Same
+
+            case kValidateErrorMaxLength:               return GetMaxLengthString();
+            case kValidateErrorMinLength:               return GetMinLengthString();
+            case kValidateErrorPattern:                 return GetPatternString();
+
+            case kValidateErrorMaxItems:                return GetMaxItemsString();
+            case kValidateErrorMinItems:                return GetMinItemsString();
+            case kValidateErrorUniqueItems:             return GetUniqueItemsString();
+            case kValidateErrorAdditionalItems:         return GetAdditionalItemsString();
+
+            case kValidateErrorMaxProperties:           return GetMaxPropertiesString();
+            case kValidateErrorMinProperties:           return GetMinPropertiesString();
+            case kValidateErrorRequired:                return GetRequiredString();
+            case kValidateErrorAdditionalProperties:    return GetAdditionalPropertiesString();
+            case kValidateErrorPatternProperties:       return GetPatternPropertiesString();
+            case kValidateErrorDependencies:            return GetDependenciesString();
+
+            case kValidateErrorEnum:                    return GetEnumString();
+            case kValidateErrorType:                    return GetTypeString();
+
+            case kValidateErrorOneOf:                   return GetOneOfString();
+            case kValidateErrorOneOfMatch:              return GetOneOfString(); // Same
+            case kValidateErrorAllOf:                   return GetAllOfString();
+            case kValidateErrorAnyOf:                   return GetAnyOfString();
+            case kValidateErrorNot:                     return GetNotString();
+
+            case kValidateErrorReadOnly:                return GetReadOnlyString();
+            case kValidateErrorWriteOnly:               return GetWriteOnlyString();
+
+            default:                                    return GetNullString();
+        }
+    }
+
+
+    // Generate functions for string literal according to Ch
+#define RAPIDJSON_STRING_(name, ...) \
+    static const ValueType& Get##name##String() {\
+        static const Ch s[] = { __VA_ARGS__, '\0' };\
+        static const ValueType v(s, static_cast<SizeType>(sizeof(s) / sizeof(Ch) - 1));\
+        return v;\
+    }
+
+    RAPIDJSON_STRING_(Null, 'n', 'u', 'l', 'l')
+    RAPIDJSON_STRING_(Boolean, 'b', 'o', 'o', 'l', 'e', 'a', 'n')
+    RAPIDJSON_STRING_(Object, 'o', 'b', 'j', 'e', 'c', 't')
+    RAPIDJSON_STRING_(Array, 'a', 'r', 'r', 'a', 'y')
+    RAPIDJSON_STRING_(String, 's', 't', 'r', 'i', 'n', 'g')
+    RAPIDJSON_STRING_(Number, 'n', 'u', 'm', 'b', 'e', 'r')
+    RAPIDJSON_STRING_(Integer, 'i', 'n', 't', 'e', 'g', 'e', 'r')
+    RAPIDJSON_STRING_(Type, 't', 'y', 'p', 'e')
+    RAPIDJSON_STRING_(Enum, 'e', 'n', 'u', 'm')
+    RAPIDJSON_STRING_(AllOf, 'a', 'l', 'l', 'O', 'f')
+    RAPIDJSON_STRING_(AnyOf, 'a', 'n', 'y', 'O', 'f')
+    RAPIDJSON_STRING_(OneOf, 'o', 'n', 'e', 'O', 'f')
+    RAPIDJSON_STRING_(Not, 'n', 'o', 't')
+    RAPIDJSON_STRING_(Properties, 'p', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's')
+    RAPIDJSON_STRING_(Required, 'r', 'e', 'q', 'u', 'i', 'r', 'e', 'd')
+    RAPIDJSON_STRING_(Dependencies, 'd', 'e', 'p', 'e', 'n', 'd', 'e', 'n', 'c', 'i', 'e', 's')
+    RAPIDJSON_STRING_(PatternProperties, 'p', 'a', 't', 't', 'e', 'r', 'n', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's')
+    RAPIDJSON_STRING_(AdditionalProperties, 'a', 'd', 'd', 'i', 't', 'i', 'o', 'n', 'a', 'l', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's')
+    RAPIDJSON_STRING_(MinProperties, 'm', 'i', 'n', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's')
+    RAPIDJSON_STRING_(MaxProperties, 'm', 'a', 'x', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's')
+    RAPIDJSON_STRING_(Items, 'i', 't', 'e', 'm', 's')
+    RAPIDJSON_STRING_(MinItems, 'm', 'i', 'n', 'I', 't', 'e', 'm', 's')
+    RAPIDJSON_STRING_(MaxItems, 'm', 'a', 'x', 'I', 't', 'e', 'm', 's')
+    RAPIDJSON_STRING_(AdditionalItems, 'a', 'd', 'd', 'i', 't', 'i', 'o', 'n', 'a', 'l', 'I', 't', 'e', 'm', 's')
+    RAPIDJSON_STRING_(UniqueItems, 'u', 'n', 'i', 'q', 'u', 'e', 'I', 't', 'e', 'm', 's')
+    RAPIDJSON_STRING_(MinLength, 'm', 'i', 'n', 'L', 'e', 'n', 'g', 't', 'h')
+    RAPIDJSON_STRING_(MaxLength, 'm', 'a', 'x', 'L', 'e', 'n', 'g', 't', 'h')
+    RAPIDJSON_STRING_(Pattern, 'p', 'a', 't', 't', 'e', 'r', 'n')
+    RAPIDJSON_STRING_(Minimum, 'm', 'i', 'n', 'i', 'm', 'u', 'm')
+    RAPIDJSON_STRING_(Maximum, 'm', 'a', 'x', 'i', 'm', 'u', 'm')
+    RAPIDJSON_STRING_(ExclusiveMinimum, 'e', 'x', 'c', 'l', 'u', 's', 'i', 'v', 'e', 'M', 'i', 'n', 'i', 'm', 'u', 'm')
+    RAPIDJSON_STRING_(ExclusiveMaximum, 'e', 'x', 'c', 'l', 'u', 's', 'i', 'v', 'e', 'M', 'a', 'x', 'i', 'm', 'u', 'm')
+    RAPIDJSON_STRING_(MultipleOf, 'm', 'u', 'l', 't', 'i', 'p', 'l', 'e', 'O', 'f')
+    RAPIDJSON_STRING_(DefaultValue, 'd', 'e', 'f', 'a', 'u', 'l', 't')
+    RAPIDJSON_STRING_(Schema, '$', 's', 'c', 'h', 'e', 'm', 'a')
+    RAPIDJSON_STRING_(Ref, '$', 'r', 'e', 'f')
+    RAPIDJSON_STRING_(Id, 'i', 'd')
+    RAPIDJSON_STRING_(Swagger, 's', 'w', 'a', 'g', 'g', 'e', 'r')
+    RAPIDJSON_STRING_(OpenApi, 'o', 'p', 'e', 'n', 'a', 'p', 'i')
+    RAPIDJSON_STRING_(ReadOnly, 'r', 'e', 'a', 'd', 'O', 'n', 'l', 'y')
+    RAPIDJSON_STRING_(WriteOnly, 'w', 'r', 'i', 't', 'e', 'O', 'n', 'l', 'y')
+    RAPIDJSON_STRING_(Nullable, 'n', 'u', 'l', 'l', 'a', 'b', 'l', 'e')
+
+#undef RAPIDJSON_STRING_
+
+private:
+    enum SchemaValueType {
+        kNullSchemaType,
+        kBooleanSchemaType,
+        kObjectSchemaType,
+        kArraySchemaType,
+        kStringSchemaType,
+        kNumberSchemaType,
+        kIntegerSchemaType,
+        kTotalSchemaType
+    };
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX
+        typedef internal::GenericRegex<EncodingType, AllocatorType> RegexType;
+#elif RAPIDJSON_SCHEMA_USE_STDREGEX
+        typedef std::basic_regex<Ch> RegexType;
+#else
+        typedef char RegexType;
+#endif
+
+    struct SchemaArray {
+        SchemaArray() : schemas(), count() {}
+        ~SchemaArray() { AllocatorType::Free(schemas); }
+        const SchemaType** schemas;
+        SizeType begin; // begin index of context.validators
+        SizeType count;
+    };
+
+    template <typename V1, typename V2>
+    void AddUniqueElement(V1& a, const V2& v) {
+        for (typename V1::ConstValueIterator itr = a.Begin(); itr != a.End(); ++itr)
+            if (*itr == v)
+                return;
+        V1 c(v, *allocator_);
+        a.PushBack(c, *allocator_);
+    }
+
+    static const ValueType* GetMember(const ValueType& value, const ValueType& name) {
+        typename ValueType::ConstMemberIterator itr = value.FindMember(name);
+        return itr != value.MemberEnd() ? &(itr->value) : 0;
+    }
+
+    static void AssignIfExist(bool& out, const ValueType& value, const ValueType& name) {
+        if (const ValueType* v = GetMember(value, name))
+            if (v->IsBool())
+                out = v->GetBool();
+    }
+
+    static void AssignIfExist(SizeType& out, const ValueType& value, const ValueType& name) {
+        if (const ValueType* v = GetMember(value, name))
+            if (v->IsUint64() && v->GetUint64() <= SizeType(~0))
+                out = static_cast<SizeType>(v->GetUint64());
+    }
+
+    void AssignIfExist(SchemaArray& out, SchemaDocumentType& schemaDocument, const PointerType& p, const ValueType& value, const ValueType& name, const ValueType& document) {
+        if (const ValueType* v = GetMember(value, name)) {
+            if (v->IsArray() && v->Size() > 0) {
+                PointerType q = p.Append(name, allocator_);
+                out.count = v->Size();
+                out.schemas = static_cast<const Schema**>(allocator_->Malloc(out.count * sizeof(const Schema*)));
+                memset(out.schemas, 0, sizeof(Schema*)* out.count);
+                for (SizeType i = 0; i < out.count; i++)
+                    schemaDocument.CreateSchema(&out.schemas[i], q.Append(i, allocator_), (*v)[i], document, id_);
+                out.begin = validatorCount_;
+                validatorCount_ += out.count;
+            }
+        }
+    }
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX
+    template <typename ValueType>
+    RegexType* CreatePattern(const ValueType& value, SchemaDocumentType* sd, const PointerType& p) {
+        if (value.IsString()) {
+            RegexType* r = new (allocator_->Malloc(sizeof(RegexType))) RegexType(value.GetString(), allocator_);
+            if (!r->IsValid()) {
+                sd->SchemaErrorValue(kSchemaErrorRegexInvalid, p, value.GetString(), value.GetStringLength());
+                r->~RegexType();
+                AllocatorType::Free(r);
+                r = 0;
+            }
+            return r;
+        }
+        return 0;
+    }
+
+    static bool IsPatternMatch(const RegexType* pattern, const Ch *str, SizeType) {
+        GenericRegexSearch<RegexType> rs(*pattern);
+        return rs.Search(str);
+    }
+#elif RAPIDJSON_SCHEMA_USE_STDREGEX
+    template <typename ValueType>
+    RegexType* CreatePattern(const ValueType& value, SchemaDocumentType* sd, const PointerType& p) {
+        if (value.IsString()) {
+            RegexType *r = static_cast<RegexType*>(allocator_->Malloc(sizeof(RegexType)));
+            try {
+                return new (r) RegexType(value.GetString(), std::size_t(value.GetStringLength()), std::regex_constants::ECMAScript);
+            }
+            catch (const std::regex_error& e) {
+                sd->SchemaErrorValue(kSchemaErrorRegexInvalid, p, value.GetString(), value.GetStringLength());
+                AllocatorType::Free(r);
+            }
+        }
+        return 0;
+    }
+
+    static bool IsPatternMatch(const RegexType* pattern, const Ch *str, SizeType length) {
+        std::match_results<const Ch*> r;
+        return std::regex_search(str, str + length, r, *pattern);
+    }
+#else
+    template <typename ValueType>
+    RegexType* CreatePattern(const ValueType&) {
+        return 0;
+    }
+
+    static bool IsPatternMatch(const RegexType*, const Ch *, SizeType) { return true; }
+#endif // RAPIDJSON_SCHEMA_USE_STDREGEX
+
+    void AddType(const ValueType& type) {
+        if      (type == GetNullString()   ) type_ |= 1 << kNullSchemaType;
+        else if (type == GetBooleanString()) type_ |= 1 << kBooleanSchemaType;
+        else if (type == GetObjectString() ) type_ |= 1 << kObjectSchemaType;
+        else if (type == GetArrayString()  ) type_ |= 1 << kArraySchemaType;
+        else if (type == GetStringString() ) type_ |= 1 << kStringSchemaType;
+        else if (type == GetIntegerString()) type_ |= 1 << kIntegerSchemaType;
+        else if (type == GetNumberString() ) type_ |= (1 << kNumberSchemaType) | (1 << kIntegerSchemaType);
+    }
+
+    // Creates parallel validators for allOf, anyOf, oneOf, not and schema dependencies, if required.
+    // Also creates a hasher for enums and array uniqueness, if required.
+    // Also a useful place to add type-independent error checks.
+    bool CreateParallelValidator(Context& context) const {
+        if (enum_ || context.arrayUniqueness)
+            context.hasher = context.factory.CreateHasher();
+
+        if (validatorCount_) {
+            RAPIDJSON_ASSERT(context.validators == 0);
+            context.validators = static_cast<ISchemaValidator**>(context.factory.MallocState(sizeof(ISchemaValidator*) * validatorCount_));
+            std::memset(context.validators, 0, sizeof(ISchemaValidator*) * validatorCount_);
+            context.validatorCount = validatorCount_;
+
+            // Always return after first failure for these sub-validators
+            if (allOf_.schemas)
+                CreateSchemaValidators(context, allOf_, false);
+
+            if (anyOf_.schemas)
+                CreateSchemaValidators(context, anyOf_, false);
+
+            if (oneOf_.schemas)
+                CreateSchemaValidators(context, oneOf_, false);
+
+            if (not_)
+                context.validators[notValidatorIndex_] = context.factory.CreateSchemaValidator(*not_, false);
+
+            if (hasSchemaDependencies_) {
+                for (SizeType i = 0; i < propertyCount_; i++)
+                    if (properties_[i].dependenciesSchema)
+                        context.validators[properties_[i].dependenciesValidatorIndex] = context.factory.CreateSchemaValidator(*properties_[i].dependenciesSchema, false);
+            }
+        }
+
+        // Add any other type-independent checks here
+        if (readOnly_ && (context.flags & kValidateWriteFlag)) {
+            context.error_handler.DisallowedWhenWriting();
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorReadOnly);
+        }
+        if (writeOnly_ && (context.flags & kValidateReadFlag)) {
+            context.error_handler.DisallowedWhenReading();
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorWriteOnly);
+        }
+
+        return true;
+    }
+
+    void CreateSchemaValidators(Context& context, const SchemaArray& schemas, const bool inheritContinueOnErrors) const {
+        for (SizeType i = 0; i < schemas.count; i++)
+            context.validators[schemas.begin + i] = context.factory.CreateSchemaValidator(*schemas.schemas[i], inheritContinueOnErrors);
+    }
+
+    // O(n)
+    bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const {
+        SizeType len = name.GetStringLength();
+        const Ch* str = name.GetString();
+        for (SizeType index = 0; index < propertyCount_; index++)
+            if (properties_[index].name.GetStringLength() == len &&
+                (std::memcmp(properties_[index].name.GetString(), str, sizeof(Ch) * len) == 0))
+            {
+                *outIndex = index;
+                return true;
+            }
+        return false;
+    }
+
+    bool CheckBool(Context& context, bool) const {
+        if (!(type_ & (1 << kBooleanSchemaType))) {
+            DisallowedType(context, GetBooleanString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+        return true;
+    }
+
+    bool CheckInt(Context& context, int64_t i) const {
+        if (!(type_ & ((1 << kIntegerSchemaType) | (1 << kNumberSchemaType)))) {
+            DisallowedType(context, GetIntegerString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        if (!minimum_.IsNull()) {
+            if (minimum_.IsInt64()) {
+                if (exclusiveMinimum_ ? i <= minimum_.GetInt64() : i < minimum_.GetInt64()) {
+                    context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMinimum_ ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum);
+                }
+            }
+            else if (minimum_.IsUint64()) {
+                context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_);
+                RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMinimum_ ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum); // i <= max(int64_t) < minimum.GetUint64()
+            }
+            else if (!CheckDoubleMinimum(context, static_cast<double>(i)))
+                return false;
+        }
+
+        if (!maximum_.IsNull()) {
+            if (maximum_.IsInt64()) {
+                if (exclusiveMaximum_ ? i >= maximum_.GetInt64() : i > maximum_.GetInt64()) {
+                    context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMaximum_ ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum);
+                }
+            }
+            else if (maximum_.IsUint64()) { }
+                /* do nothing */ // i <= max(int64_t) < maximum_.GetUint64()
+            else if (!CheckDoubleMaximum(context, static_cast<double>(i)))
+                return false;
+        }
+
+        if (!multipleOf_.IsNull()) {
+            if (multipleOf_.IsUint64()) {
+                if (static_cast<uint64_t>(i >= 0 ? i : -i) % multipleOf_.GetUint64() != 0) {
+                    context.error_handler.NotMultipleOf(i, multipleOf_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMultipleOf);
+                }
+            }
+            else if (!CheckDoubleMultipleOf(context, static_cast<double>(i)))
+                return false;
+        }
+
+        return true;
+    }
+
+    bool CheckUint(Context& context, uint64_t i) const {
+        if (!(type_ & ((1 << kIntegerSchemaType) | (1 << kNumberSchemaType)))) {
+            DisallowedType(context, GetIntegerString());
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorType);
+        }
+
+        if (!minimum_.IsNull()) {
+            if (minimum_.IsUint64()) {
+                if (exclusiveMinimum_ ? i <= minimum_.GetUint64() : i < minimum_.GetUint64()) {
+                    context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMinimum_ ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum);
+                }
+            }
+            else if (minimum_.IsInt64())
+                /* do nothing */; // i >= 0 > minimum.Getint64()
+            else if (!CheckDoubleMinimum(context, static_cast<double>(i)))
+                return false;
+        }
+
+        if (!maximum_.IsNull()) {
+            if (maximum_.IsUint64()) {
+                if (exclusiveMaximum_ ? i >= maximum_.GetUint64() : i > maximum_.GetUint64()) {
+                    context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMaximum_ ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum);
+                }
+            }
+            else if (maximum_.IsInt64()) {
+                context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_);
+                RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMaximum_ ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum); // i >= 0 > maximum_
+            }
+            else if (!CheckDoubleMaximum(context, static_cast<double>(i)))
+                return false;
+        }
+
+        if (!multipleOf_.IsNull()) {
+            if (multipleOf_.IsUint64()) {
+                if (i % multipleOf_.GetUint64() != 0) {
+                    context.error_handler.NotMultipleOf(i, multipleOf_);
+                    RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMultipleOf);
+                }
+            }
+            else if (!CheckDoubleMultipleOf(context, static_cast<double>(i)))
+                return false;
+        }
+
+        return true;
+    }
+
+    bool CheckDoubleMinimum(Context& context, double d) const {
+        if (exclusiveMinimum_ ? d <= minimum_.GetDouble() : d < minimum_.GetDouble()) {
+            context.error_handler.BelowMinimum(d, minimum_, exclusiveMinimum_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMinimum_ ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum);
+        }
+        return true;
+    }
+
+    bool CheckDoubleMaximum(Context& context, double d) const {
+        if (exclusiveMaximum_ ? d >= maximum_.GetDouble() : d > maximum_.GetDouble()) {
+            context.error_handler.AboveMaximum(d, maximum_, exclusiveMaximum_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(exclusiveMaximum_ ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum);
+        }
+        return true;
+    }
+
+    bool CheckDoubleMultipleOf(Context& context, double d) const {
+        double a = std::abs(d), b = std::abs(multipleOf_.GetDouble());
+        double q = a / b;
+        double qRounded = std::floor(q + 0.5);
+        double scaledEpsilon = (q + qRounded) * std::numeric_limits<double>::epsilon();
+        double difference = std::abs(qRounded - q);
+        bool isMultiple = difference <= scaledEpsilon || difference < (std::numeric_limits<double>::min)();
+        if (!isMultiple) {
+            context.error_handler.NotMultipleOf(d, multipleOf_);
+            RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorMultipleOf);
+        }
+        return true;
+    }
+
+    void DisallowedType(Context& context, const ValueType& actualType) const {
+        ErrorHandler& eh = context.error_handler;
+        eh.StartDisallowedType();
+
+        if (type_ & (1 << kNullSchemaType)) eh.AddExpectedType(GetNullString());
+        if (type_ & (1 << kBooleanSchemaType)) eh.AddExpectedType(GetBooleanString());
+        if (type_ & (1 << kObjectSchemaType)) eh.AddExpectedType(GetObjectString());
+        if (type_ & (1 << kArraySchemaType)) eh.AddExpectedType(GetArrayString());
+        if (type_ & (1 << kStringSchemaType)) eh.AddExpectedType(GetStringString());
+
+        if (type_ & (1 << kNumberSchemaType)) eh.AddExpectedType(GetNumberString());
+        else if (type_ & (1 << kIntegerSchemaType)) eh.AddExpectedType(GetIntegerString());
+
+        eh.EndDisallowedType(actualType);
+    }
+
+    struct Property {
+        Property() : schema(), dependenciesSchema(), dependenciesValidatorIndex(), dependencies(), required(false) {}
+        ~Property() { AllocatorType::Free(dependencies); }
+        SValue name;
+        const SchemaType* schema;
+        const SchemaType* dependenciesSchema;
+        SizeType dependenciesValidatorIndex;
+        bool* dependencies;
+        bool required;
+    };
+
+    struct PatternProperty {
+        PatternProperty() : schema(), pattern() {}
+        ~PatternProperty() {
+            if (pattern) {
+                pattern->~RegexType();
+                AllocatorType::Free(pattern);
+            }
+        }
+        const SchemaType* schema;
+        RegexType* pattern;
+    };
+
+    AllocatorType* allocator_;
+    SValue uri_;
+    UriType id_;
+    Specification spec_;
+    PointerType pointer_;
+    const SchemaType* typeless_;
+    uint64_t* enum_;
+    SizeType enumCount_;
+    SchemaArray allOf_;
+    SchemaArray anyOf_;
+    SchemaArray oneOf_;
+    const SchemaType* not_;
+    unsigned type_; // bitmask of kSchemaType
+    SizeType validatorCount_;
+    SizeType notValidatorIndex_;
+
+    Property* properties_;
+    const SchemaType* additionalPropertiesSchema_;
+    PatternProperty* patternProperties_;
+    SizeType patternPropertyCount_;
+    SizeType propertyCount_;
+    SizeType minProperties_;
+    SizeType maxProperties_;
+    bool additionalProperties_;
+    bool hasDependencies_;
+    bool hasRequired_;
+    bool hasSchemaDependencies_;
+
+    const SchemaType* additionalItemsSchema_;
+    const SchemaType* itemsList_;
+    const SchemaType** itemsTuple_;
+    SizeType itemsTupleCount_;
+    SizeType minItems_;
+    SizeType maxItems_;
+    bool additionalItems_;
+    bool uniqueItems_;
+
+    RegexType* pattern_;
+    SizeType minLength_;
+    SizeType maxLength_;
+
+    SValue minimum_;
+    SValue maximum_;
+    SValue multipleOf_;
+    bool exclusiveMinimum_;
+    bool exclusiveMaximum_;
+
+    SizeType defaultValueLength_;
+
+    bool readOnly_;
+    bool writeOnly_;
+    bool nullable_;
+};
+
+template<typename Stack, typename Ch>
+struct TokenHelper {
+    RAPIDJSON_FORCEINLINE static void AppendIndexToken(Stack& documentStack, SizeType index) {
+        *documentStack.template Push<Ch>() = '/';
+        char buffer[21];
+        size_t length = static_cast<size_t>((sizeof(SizeType) == 4 ? u32toa(index, buffer) : u64toa(index, buffer)) - buffer);
+        for (size_t i = 0; i < length; i++)
+            *documentStack.template Push<Ch>() = static_cast<Ch>(buffer[i]);
+    }
+};
+
+// Partial specialized version for char to prevent buffer copying.
+template <typename Stack>
+struct TokenHelper<Stack, char> {
+    RAPIDJSON_FORCEINLINE static void AppendIndexToken(Stack& documentStack, SizeType index) {
+        RAPIDJSON_IF_CONSTEXPR (sizeof(SizeType) == 4) {
+            char *buffer = documentStack.template Push<char>(1 + 10); // '/' + uint
+            *buffer++ = '/';
+            const char* end = internal::u32toa(index, buffer);
+             documentStack.template Pop<char>(static_cast<size_t>(10 - (end - buffer)));
+        }
+        else {
+            char *buffer = documentStack.template Push<char>(1 + 20); // '/' + uint64
+            *buffer++ = '/';
+            const char* end = internal::u64toa(index, buffer);
+            documentStack.template Pop<char>(static_cast<size_t>(20 - (end - buffer)));
+        }
+    }
+};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// IGenericRemoteSchemaDocumentProvider
+
+template <typename SchemaDocumentType>
+class IGenericRemoteSchemaDocumentProvider {
+public:
+    typedef typename SchemaDocumentType::Ch Ch;
+    typedef typename SchemaDocumentType::ValueType ValueType;
+    typedef typename SchemaDocumentType::AllocatorType AllocatorType;
+
+    virtual ~IGenericRemoteSchemaDocumentProvider() {}
+    virtual const SchemaDocumentType* GetRemoteDocument(const Ch* uri, SizeType length) = 0;
+    virtual const SchemaDocumentType* GetRemoteDocument(const GenericUri<ValueType, AllocatorType> uri, Specification& spec) {
+        // Default implementation just calls through for compatibility
+        // Following line suppresses unused parameter warning
+        (void)spec;
+        // printf("GetRemoteDocument: %d %d\n", spec.draft, spec.oapi);
+        return GetRemoteDocument(uri.GetBaseString(), uri.GetBaseStringLength());
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericSchemaDocument
+
+//! JSON schema document.
+/*!
+    A JSON schema document is a compiled version of a JSON schema.
+    It is basically a tree of internal::Schema.
+
+    \note This is an immutable class (i.e. its instance cannot be modified after construction).
+    \tparam ValueT Type of JSON value (e.g. \c Value ), which also determine the encoding.
+    \tparam Allocator Allocator type for allocating memory of this document.
+*/
+template <typename ValueT, typename Allocator = CrtAllocator>
+class GenericSchemaDocument {
+public:
+    typedef ValueT ValueType;
+    typedef IGenericRemoteSchemaDocumentProvider<GenericSchemaDocument> IRemoteSchemaDocumentProviderType;
+    typedef Allocator AllocatorType;
+    typedef typename ValueType::EncodingType EncodingType;
+    typedef typename EncodingType::Ch Ch;
+    typedef internal::Schema<GenericSchemaDocument> SchemaType;
+    typedef GenericPointer<ValueType, Allocator> PointerType;
+    typedef GenericValue<EncodingType, AllocatorType> GValue;
+    typedef GenericUri<ValueType, Allocator> UriType;
+    typedef GenericStringRef<Ch> StringRefType;
+    friend class internal::Schema<GenericSchemaDocument>;
+    template <typename, typename, typename>
+    friend class GenericSchemaValidator;
+
+    //! Constructor.
+    /*!
+        Compile a JSON document into schema document.
+
+        \param document A JSON document as source.
+        \param uri The base URI of this schema document for purposes of violation reporting.
+        \param uriLength Length of \c name, in code points.
+        \param remoteProvider An optional remote schema document provider for resolving remote reference. Can be null.
+        \param allocator An optional allocator instance for allocating memory. Can be null.
+        \param pointer An optional JSON pointer to the start of the schema document
+        \param spec Optional schema draft or OpenAPI version. Used if no specification in document. Defaults to draft-04.
+    */
+    explicit GenericSchemaDocument(const ValueType& document, const Ch* uri = 0, SizeType uriLength = 0,
+        IRemoteSchemaDocumentProviderType* remoteProvider = 0, Allocator* allocator = 0,
+        const PointerType& pointer = PointerType(), // PR #1393
+        const Specification& spec = Specification(kDraft04)) :
+        remoteProvider_(remoteProvider),
+        allocator_(allocator),
+        ownAllocator_(),
+        root_(),
+        typeless_(),
+        schemaMap_(allocator, kInitialSchemaMapSize),
+        schemaRef_(allocator, kInitialSchemaRefSize),
+        spec_(spec),
+        error_(kObjectType),
+        currentError_()
+    {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaDocument::GenericSchemaDocument");
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        Ch noUri[1] = {0};
+        uri_.SetString(uri ? uri : noUri, uriLength, *allocator_);
+        docId_ = UriType(uri_, allocator_);
+
+        typeless_ = static_cast<SchemaType*>(allocator_->Malloc(sizeof(SchemaType)));
+        new (typeless_) SchemaType(this, PointerType(), ValueType(kObjectType).Move(), ValueType(kObjectType).Move(), allocator_, docId_);
+
+        // Establish the schema draft or open api version.
+        // We only ever look for '$schema' or 'swagger' or 'openapi' at the root of the document.
+        SetSchemaSpecification(document);
+
+        // Generate root schema, it will call CreateSchema() to create sub-schemas,
+        // And call HandleRefSchema() if there are $ref.
+        // PR #1393 use input pointer if supplied
+        root_ = typeless_;
+        if (pointer.GetTokenCount() == 0) {
+            CreateSchemaRecursive(&root_, pointer, document, document, docId_);
+        }
+        else if (const ValueType* v = pointer.Get(document)) {
+            CreateSchema(&root_, pointer, *v, document, docId_);
+        }
+        else {
+            GenericStringBuffer<EncodingType> sb;
+            pointer.StringifyUriFragment(sb);
+            SchemaErrorValue(kSchemaErrorStartUnknown, PointerType(), sb.GetString(), static_cast<SizeType>(sb.GetSize() / sizeof(Ch)));
+        }
+
+        RAPIDJSON_ASSERT(root_ != 0);
+
+        schemaRef_.ShrinkToFit(); // Deallocate all memory for ref
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move constructor in C++11
+    GenericSchemaDocument(GenericSchemaDocument&& rhs) RAPIDJSON_NOEXCEPT :
+        remoteProvider_(rhs.remoteProvider_),
+        allocator_(rhs.allocator_),
+        ownAllocator_(rhs.ownAllocator_),
+        root_(rhs.root_),
+        typeless_(rhs.typeless_),
+        schemaMap_(std::move(rhs.schemaMap_)),
+        schemaRef_(std::move(rhs.schemaRef_)),
+        uri_(std::move(rhs.uri_)),
+        docId_(std::move(rhs.docId_)),
+        spec_(rhs.spec_),
+        error_(std::move(rhs.error_)),
+        currentError_(std::move(rhs.currentError_))
+    {
+        rhs.remoteProvider_ = 0;
+        rhs.allocator_ = 0;
+        rhs.ownAllocator_ = 0;
+        rhs.typeless_ = 0;
+    }
+#endif
+
+    //! Destructor
+    ~GenericSchemaDocument() {
+        while (!schemaMap_.Empty())
+            schemaMap_.template Pop<SchemaEntry>(1)->~SchemaEntry();
+
+        if (typeless_) {
+            typeless_->~SchemaType();
+            Allocator::Free(typeless_);
+        }
+
+        // these may contain some allocator data so clear before deleting ownAllocator_
+        uri_.SetNull();
+        error_.SetNull();
+        currentError_.SetNull();
+
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    const GValue& GetURI() const { return uri_; }
+
+    const Specification& GetSpecification() const { return spec_; }
+    bool IsSupportedSpecification() const { return spec_.IsSupported(); }
+
+    //! Static method to get the specification of any schema document
+    //  Returns kDraftNone if document is silent
+    static const Specification GetSpecification(const ValueType& document) {
+      SchemaDraft draft = GetSchemaDraft(document);
+      if (draft != kDraftNone)
+        return Specification(draft);
+      else {
+        OpenApiVersion oapi = GetOpenApiVersion(document);
+        if (oapi != kVersionNone)
+          return Specification(oapi);
+      }
+      return Specification(kDraftNone);
+    }
+
+    //! Get the root schema.
+    const SchemaType& GetRoot() const { return *root_; }
+
+    //! Gets the error object.
+    GValue& GetError() { return error_; }
+    const GValue& GetError() const { return error_; }
+
+    static const StringRefType& GetSchemaErrorKeyword(SchemaErrorCode schemaErrorCode) {
+        switch (schemaErrorCode) {
+            case kSchemaErrorStartUnknown:             return GetStartUnknownString();
+            case kSchemaErrorRefPlainName:             return GetRefPlainNameString();
+            case kSchemaErrorRefInvalid:               return GetRefInvalidString();
+            case kSchemaErrorRefPointerInvalid:        return GetRefPointerInvalidString();
+            case kSchemaErrorRefUnknown:               return GetRefUnknownString();
+            case kSchemaErrorRefCyclical:              return GetRefCyclicalString();
+            case kSchemaErrorRefNoRemoteProvider:      return GetRefNoRemoteProviderString();
+            case kSchemaErrorRefNoRemoteSchema:        return GetRefNoRemoteSchemaString();
+            case kSchemaErrorRegexInvalid:             return GetRegexInvalidString();
+            case kSchemaErrorSpecUnknown:              return GetSpecUnknownString();
+            case kSchemaErrorSpecUnsupported:          return GetSpecUnsupportedString();
+            case kSchemaErrorSpecIllegal:              return GetSpecIllegalString();
+            case kSchemaErrorReadOnlyAndWriteOnly:     return GetReadOnlyAndWriteOnlyString();
+            default:                                   return GetNullString();
+        }
+    }
+
+    //! Default error method
+    void SchemaError(const SchemaErrorCode code, const PointerType& location) {
+      currentError_ = GValue(kObjectType);
+      AddCurrentError(code, location);
+    }
+
+    //! Method for error with single string value insert
+    void SchemaErrorValue(const SchemaErrorCode code, const PointerType& location, const Ch* value, SizeType length) {
+      currentError_ = GValue(kObjectType);
+      currentError_.AddMember(GetValueString(), GValue(value, length, *allocator_).Move(), *allocator_);
+      AddCurrentError(code, location);
+    }
+
+    //! Method for error with invalid pointer
+    void SchemaErrorPointer(const SchemaErrorCode code, const PointerType& location, const Ch* value, SizeType length, const PointerType& pointer) {
+      currentError_ = GValue(kObjectType);
+      currentError_.AddMember(GetValueString(), GValue(value, length, *allocator_).Move(), *allocator_);
+      currentError_.AddMember(GetOffsetString(), static_cast<SizeType>(pointer.GetParseErrorOffset() / sizeof(Ch)), *allocator_);
+      AddCurrentError(code, location);
+    }
+
+  private:
+    //! Prohibit copying
+    GenericSchemaDocument(const GenericSchemaDocument&);
+    //! Prohibit assignment
+    GenericSchemaDocument& operator=(const GenericSchemaDocument&);
+
+    typedef const PointerType* SchemaRefPtr; // PR #1393
+
+    struct SchemaEntry {
+        SchemaEntry(const PointerType& p, SchemaType* s, bool o, Allocator* allocator) : pointer(p, allocator), schema(s), owned(o) {}
+        ~SchemaEntry() {
+            if (owned) {
+                schema->~SchemaType();
+                Allocator::Free(schema);
+            }
+        }
+        PointerType pointer;
+        SchemaType* schema;
+        bool owned;
+    };
+
+    void AddErrorInstanceLocation(GValue& result, const PointerType& location) {
+      GenericStringBuffer<EncodingType> sb;
+      location.StringifyUriFragment(sb);
+      GValue instanceRef(sb.GetString(), static_cast<SizeType>(sb.GetSize() / sizeof(Ch)), *allocator_);
+      result.AddMember(GetInstanceRefString(), instanceRef, *allocator_);
+    }
+
+    void AddError(GValue& keyword, GValue& error) {
+      typename GValue::MemberIterator member = error_.FindMember(keyword);
+      if (member == error_.MemberEnd())
+        error_.AddMember(keyword, error, *allocator_);
+      else {
+        if (member->value.IsObject()) {
+          GValue errors(kArrayType);
+          errors.PushBack(member->value, *allocator_);
+          member->value = errors;
+        }
+        member->value.PushBack(error, *allocator_);
+      }
+    }
+
+    void AddCurrentError(const SchemaErrorCode code, const PointerType& location) {
+      RAPIDJSON_SCHEMA_PRINT(InvalidKeyword, GetSchemaErrorKeyword(code));
+      currentError_.AddMember(GetErrorCodeString(), code, *allocator_);
+      AddErrorInstanceLocation(currentError_, location);
+      AddError(GValue(GetSchemaErrorKeyword(code)).Move(), currentError_);
+    }
+
+#define RAPIDJSON_STRING_(name, ...) \
+    static const StringRefType& Get##name##String() {\
+        static const Ch s[] = { __VA_ARGS__, '\0' };\
+        static const StringRefType v(s, static_cast<SizeType>(sizeof(s) / sizeof(Ch) - 1)); \
+        return v;\
+    }
+
+    RAPIDJSON_STRING_(InstanceRef, 'i', 'n', 's', 't', 'a', 'n', 'c', 'e', 'R', 'e', 'f')
+    RAPIDJSON_STRING_(ErrorCode, 'e', 'r', 'r', 'o', 'r', 'C', 'o', 'd', 'e')
+    RAPIDJSON_STRING_(Value, 'v', 'a', 'l', 'u', 'e')
+    RAPIDJSON_STRING_(Offset, 'o', 'f', 'f', 's', 'e', 't')
+
+    RAPIDJSON_STRING_(Null, 'n', 'u', 'l', 'l')
+    RAPIDJSON_STRING_(SpecUnknown, 'S', 'p', 'e', 'c', 'U', 'n', 'k', 'n', 'o', 'w', 'n')
+    RAPIDJSON_STRING_(SpecUnsupported, 'S', 'p', 'e', 'c', 'U', 'n', 's', 'u', 'p', 'p', 'o', 'r', 't', 'e', 'd')
+    RAPIDJSON_STRING_(SpecIllegal, 'S', 'p', 'e', 'c', 'I', 'l', 'l', 'e', 'g', 'a', 'l')
+    RAPIDJSON_STRING_(StartUnknown, 'S', 't', 'a', 'r', 't', 'U', 'n', 'k', 'n', 'o', 'w', 'n')
+    RAPIDJSON_STRING_(RefPlainName, 'R', 'e', 'f', 'P', 'l', 'a', 'i', 'n', 'N', 'a', 'm', 'e')
+    RAPIDJSON_STRING_(RefInvalid, 'R', 'e', 'f', 'I', 'n', 'v', 'a', 'l', 'i', 'd')
+    RAPIDJSON_STRING_(RefPointerInvalid, 'R', 'e', 'f', 'P', 'o', 'i', 'n', 't', 'e', 'r', 'I', 'n', 'v', 'a', 'l', 'i', 'd')
+    RAPIDJSON_STRING_(RefUnknown, 'R', 'e', 'f', 'U', 'n', 'k', 'n', 'o', 'w', 'n')
+    RAPIDJSON_STRING_(RefCyclical, 'R', 'e', 'f', 'C', 'y', 'c', 'l', 'i', 'c', 'a', 'l')
+    RAPIDJSON_STRING_(RefNoRemoteProvider, 'R', 'e', 'f', 'N', 'o', 'R', 'e', 'm', 'o', 't', 'e', 'P', 'r', 'o', 'v', 'i', 'd', 'e', 'r')
+    RAPIDJSON_STRING_(RefNoRemoteSchema, 'R', 'e', 'f', 'N', 'o', 'R', 'e', 'm', 'o', 't', 'e', 'S', 'c', 'h', 'e', 'm', 'a')
+    RAPIDJSON_STRING_(ReadOnlyAndWriteOnly, 'R', 'e', 'a', 'd', 'O', 'n', 'l', 'y', 'A', 'n', 'd', 'W', 'r', 'i', 't', 'e', 'O', 'n', 'l', 'y')
+    RAPIDJSON_STRING_(RegexInvalid, 'R', 'e', 'g', 'e', 'x', 'I', 'n', 'v', 'a', 'l', 'i', 'd')
+
+#undef RAPIDJSON_STRING_
+
+    // Static method to get schema draft of any schema document
+    static SchemaDraft GetSchemaDraft(const ValueType& document) {
+        static const Ch kDraft03String[] = { 'h', 't', 't', 'p', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '-', '0', '3', '/', 's', 'c', 'h', 'e', 'm', 'a', '#', '\0' };
+        static const Ch kDraft04String[] = { 'h', 't', 't', 'p', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '-', '0', '4', '/', 's', 'c', 'h', 'e', 'm', 'a', '#', '\0' };
+        static const Ch kDraft05String[] = { 'h', 't', 't', 'p', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '-', '0', '5', '/', 's', 'c', 'h', 'e', 'm', 'a', '#', '\0' };
+        static const Ch kDraft06String[] = { 'h', 't', 't', 'p', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '-', '0', '6', '/', 's', 'c', 'h', 'e', 'm', 'a', '#', '\0' };
+        static const Ch kDraft07String[] = { 'h', 't', 't', 'p', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '-', '0', '7', '/', 's', 'c', 'h', 'e', 'm', 'a', '#', '\0' };
+        static const Ch kDraft2019_09String[] = { 'h', 't', 't', 'p', 's', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '/', '2', '0', '1', '9', '-', '0', '9', '/', 's', 'c', 'h', 'e', 'm', 'a', '\0' };
+        static const Ch kDraft2020_12String[] = { 'h', 't', 't', 'p', 's', ':', '/', '/', 'j', 's', 'o', 'n', '-', 's', 'c', 'h', 'e', 'm', 'a', '.', 'o', 'r', 'g', '/', 'd', 'r', 'a', 'f', 't', '/', '2', '0', '2', '0', '-', '1', '2', '/', 's', 'c', 'h', 'e', 'm', 'a', '\0' };
+
+        if (!document.IsObject()) {
+            return kDraftNone;
+        }
+
+        // Get the schema draft from the $schema keyword at the supplied location
+        typename ValueType::ConstMemberIterator itr = document.FindMember(SchemaType::GetSchemaString());
+        if (itr != document.MemberEnd()) {
+            if (!itr->value.IsString()) return kDraftUnknown;
+            const UriType draftUri(itr->value);
+            // Check base uri for match
+            if (draftUri.Match(UriType(kDraft04String), false)) return kDraft04;
+            if (draftUri.Match(UriType(kDraft05String), false)) return kDraft05;
+            if (draftUri.Match(UriType(kDraft06String), false)) return kDraft06;
+            if (draftUri.Match(UriType(kDraft07String), false)) return kDraft07;
+            if (draftUri.Match(UriType(kDraft03String), false)) return kDraft03;
+            if (draftUri.Match(UriType(kDraft2019_09String), false)) return kDraft2019_09;
+            if (draftUri.Match(UriType(kDraft2020_12String), false)) return kDraft2020_12;
+            return kDraftUnknown;
+        }
+        // $schema not found
+        return kDraftNone;
+    }
+
+
+    // Get open api version of any schema document
+    static OpenApiVersion GetOpenApiVersion(const ValueType& document) {
+        static const Ch kVersion20String[] = { '2', '.', '0', '\0' };
+        static const Ch kVersion30String[] = { '3', '.', '0', '.', '\0' }; // ignore patch level
+        static const Ch kVersion31String[] = { '3', '.', '1', '.', '\0' }; // ignore patch level
+        static SizeType len = internal::StrLen<Ch>(kVersion30String);
+
+        if (!document.IsObject()) {
+            return kVersionNone;
+        }
+
+        // Get the open api version from the swagger / openapi keyword at the supplied location
+        typename ValueType::ConstMemberIterator itr = document.FindMember(SchemaType::GetSwaggerString());
+        if (itr == document.MemberEnd()) itr = document.FindMember(SchemaType::GetOpenApiString());
+        if (itr != document.MemberEnd()) {
+            if (!itr->value.IsString()) return kVersionUnknown;
+            const ValueType kVersion20Value(kVersion20String);
+            if (kVersion20Value == itr->value) return kVersion20; // must match 2.0 exactly
+            const ValueType kVersion30Value(kVersion30String);
+            if (itr->value.GetStringLength() > len && kVersion30Value == ValueType(itr->value.GetString(), len)) return kVersion30; // must match 3.0.x
+            const ValueType kVersion31Value(kVersion31String);
+            if (itr->value.GetStringLength() > len && kVersion31Value == ValueType(itr->value.GetString(), len)) return kVersion31; // must match 3.1.x
+            return kVersionUnknown;
+        }
+        // swagger or openapi not found
+        return kVersionNone;
+    }
+
+    // Get the draft of the schema or the open api version (which implies the draft).
+    // Report an error if schema draft or open api version not supported or not recognized, or both in document, and carry on.
+    void SetSchemaSpecification(const ValueType& document) {
+        // Look for '$schema', 'swagger' or 'openapi' keyword at document root
+        SchemaDraft docDraft = GetSchemaDraft(document);
+        OpenApiVersion docOapi = GetOpenApiVersion(document);
+        // Error if both in document
+        if (docDraft != kDraftNone && docOapi != kVersionNone)
+          SchemaError(kSchemaErrorSpecIllegal, PointerType());
+        // Use document draft or open api version if present or use spec from constructor
+        if (docDraft != kDraftNone)
+            spec_ = Specification(docDraft);
+        else if (docOapi != kVersionNone)
+            spec_ = Specification(docOapi);
+        // Error if draft or version unknown
+        if (spec_.draft == kDraftUnknown || spec_.oapi == kVersionUnknown)
+          SchemaError(kSchemaErrorSpecUnknown, PointerType());
+        else if (!spec_.IsSupported())
+            SchemaError(kSchemaErrorSpecUnsupported, PointerType());
+    }
+
+    // Changed by PR #1393
+    void CreateSchemaRecursive(const SchemaType** schema, const PointerType& pointer, const ValueType& v, const ValueType& document, const UriType& id) {
+        if (v.GetType() == kObjectType) {
+            UriType newid = UriType(CreateSchema(schema, pointer, v, document, id), allocator_);
+
+            for (typename ValueType::ConstMemberIterator itr = v.MemberBegin(); itr != v.MemberEnd(); ++itr)
+                CreateSchemaRecursive(0, pointer.Append(itr->name, allocator_), itr->value, document, newid);
+        }
+        else if (v.GetType() == kArrayType)
+            for (SizeType i = 0; i < v.Size(); i++)
+                CreateSchemaRecursive(0, pointer.Append(i, allocator_), v[i], document, id);
+    }
+
+    // Changed by PR #1393
+    const UriType& CreateSchema(const SchemaType** schema, const PointerType& pointer, const ValueType& v, const ValueType& document, const UriType& id) {
+        RAPIDJSON_ASSERT(pointer.IsValid());
+        GenericStringBuffer<EncodingType> sb;
+        pointer.StringifyUriFragment(sb);
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaDocument::CreateSchema", sb.GetString(), id.GetString());
+        if (v.IsObject()) {
+            if (const SchemaType* sc = GetSchema(pointer)) {
+                if (schema)
+                    *schema = sc;
+                AddSchemaRefs(const_cast<SchemaType*>(sc));
+            }
+            else if (!HandleRefSchema(pointer, schema, v, document, id)) {
+                // The new schema constructor adds itself and its $ref(s) to schemaMap_
+                SchemaType* s = new (allocator_->Malloc(sizeof(SchemaType))) SchemaType(this, pointer, v, document, allocator_, id);
+                if (schema)
+                    *schema = s;
+                return s->GetId();
+            }
+        }
+        else {
+            if (schema)
+                *schema = typeless_;
+            AddSchemaRefs(typeless_);
+        }
+        return id;
+    }
+
+    // Changed by PR #1393
+    // TODO should this return a UriType& ?
+    bool HandleRefSchema(const PointerType& source, const SchemaType** schema, const ValueType& v, const ValueType& document, const UriType& id) {
+        typename ValueType::ConstMemberIterator itr = v.FindMember(SchemaType::GetRefString());
+        if (itr == v.MemberEnd())
+            return false;
+
+        GenericStringBuffer<EncodingType> sb;
+        source.StringifyUriFragment(sb);
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaDocument::HandleRefSchema", sb.GetString(), id.GetString());
+        // Resolve the source pointer to the $ref'ed schema (finally)
+        new (schemaRef_.template Push<SchemaRefPtr>()) SchemaRefPtr(&source);
+
+        if (itr->value.IsString()) {
+            SizeType len = itr->value.GetStringLength();
+            if (len == 0)
+                SchemaError(kSchemaErrorRefInvalid, source);
+            else {
+                // First resolve $ref against the in-scope id
+                UriType scopeId = UriType(id, allocator_);
+                UriType ref = UriType(itr->value, allocator_).Resolve(scopeId, allocator_);
+                RAPIDJSON_SCHEMA_PRINT(SchemaIds, id.GetString(), itr->value.GetString(), ref.GetString());
+                // See if the resolved $ref minus the fragment matches a resolved id in this document
+                // Search from the root. Returns the subschema in the document and its absolute JSON pointer.
+                PointerType basePointer = PointerType();
+                const ValueType *base = FindId(document, ref, basePointer, docId_, false);
+                if (!base) {
+                    // Remote reference - call the remote document provider
+                    if (!remoteProvider_)
+                        SchemaError(kSchemaErrorRefNoRemoteProvider, source);
+                    else {
+                        if (const GenericSchemaDocument* remoteDocument = remoteProvider_->GetRemoteDocument(ref, spec_)) {
+                            const Ch* s = ref.GetFragString();
+                            len = ref.GetFragStringLength();
+                            if (len <= 1 || s[1] == '/') {
+                                // JSON pointer fragment, absolute in the remote schema
+                                const PointerType pointer(s, len, allocator_);
+                                if (!pointer.IsValid())
+                                    SchemaErrorPointer(kSchemaErrorRefPointerInvalid, source, s, len, pointer);
+                                else {
+                                    // Get the subschema
+                                    if (const SchemaType *sc = remoteDocument->GetSchema(pointer)) {
+                                        if (schema)
+                                            *schema = sc;
+                                        AddSchemaRefs(const_cast<SchemaType *>(sc));
+                                        return true;
+                                    } else
+                                        SchemaErrorValue(kSchemaErrorRefUnknown, source, ref.GetString(), ref.GetStringLength());
+                                }
+                            } else
+                                // Plain name fragment, not allowed in remote schema
+                                SchemaErrorValue(kSchemaErrorRefPlainName, source, s, len);
+                        } else
+                          SchemaErrorValue(kSchemaErrorRefNoRemoteSchema, source, ref.GetString(), ref.GetStringLength());
+                    }
+                }
+                else { // Local reference
+                    const Ch* s = ref.GetFragString();
+                    len = ref.GetFragStringLength();
+                    if (len <= 1 || s[1] == '/') {
+                        // JSON pointer fragment, relative to the resolved URI
+                        const PointerType relPointer(s, len, allocator_);
+                        if (!relPointer.IsValid())
+                            SchemaErrorPointer(kSchemaErrorRefPointerInvalid, source, s, len, relPointer);
+                        else {
+                            // Get the subschema
+                            if (const ValueType *pv = relPointer.Get(*base)) {
+                                // Now get the absolute JSON pointer by adding relative to base
+                                PointerType pointer(basePointer, allocator_);
+                                for (SizeType i = 0; i < relPointer.GetTokenCount(); i++)
+                                    pointer = pointer.Append(relPointer.GetTokens()[i], allocator_);
+                                if (IsCyclicRef(pointer))
+                                    SchemaErrorValue(kSchemaErrorRefCyclical, source, ref.GetString(), ref.GetStringLength());
+                                else {
+                                    // Call CreateSchema recursively, but first compute the in-scope id for the $ref target as we have jumped there
+                                    // TODO: cache pointer <-> id mapping
+                                    size_t unresolvedTokenIndex;
+                                    scopeId = pointer.GetUri(document, docId_, &unresolvedTokenIndex, allocator_);
+                                    CreateSchema(schema, pointer, *pv, document, scopeId);
+                                    return true;
+                                }
+                            } else
+                                SchemaErrorValue(kSchemaErrorRefUnknown, source, ref.GetString(), ref.GetStringLength());
+                        }
+                    } else {
+                        // Plain name fragment, relative to the resolved URI
+                        // Not supported in open api 2.0 and 3.0
+                        PointerType pointer(allocator_);
+                        if (spec_.oapi == kVersion20 || spec_.oapi == kVersion30)
+                            SchemaErrorValue(kSchemaErrorRefPlainName, source, s, len);
+                        // See if the fragment matches an id in this document.
+                        // Search from the base we just established. Returns the subschema in the document and its absolute JSON pointer.
+                        else if (const ValueType *pv = FindId(*base, ref, pointer, UriType(ref.GetBaseString(), ref.GetBaseStringLength(), allocator_), true, basePointer)) {
+                            if (IsCyclicRef(pointer))
+                                SchemaErrorValue(kSchemaErrorRefCyclical, source, ref.GetString(), ref.GetStringLength());
+                            else {
+                                // Call CreateSchema recursively, but first compute the in-scope id for the $ref target as we have jumped there
+                                // TODO: cache pointer <-> id mapping
+                                size_t unresolvedTokenIndex;
+                                scopeId = pointer.GetUri(document, docId_, &unresolvedTokenIndex, allocator_);
+                                CreateSchema(schema, pointer, *pv, document, scopeId);
+                                return true;
+                            }
+                        } else
+                            SchemaErrorValue(kSchemaErrorRefUnknown, source, ref.GetString(), ref.GetStringLength());
+                    }
+                }
+            }
+        }
+
+        // Invalid/Unknown $ref
+        if (schema)
+            *schema = typeless_;
+        AddSchemaRefs(typeless_);
+        return true;
+    }
+
+    //! Find the first subschema with a resolved 'id' that matches the specified URI.
+    // If full specified use all URI else ignore fragment.
+    // If found, return a pointer to the subschema and its JSON pointer.
+    // TODO cache pointer <-> id mapping
+    ValueType* FindId(const ValueType& doc, const UriType& finduri, PointerType& resptr, const UriType& baseuri, bool full, const PointerType& here = PointerType()) const {
+        SizeType i = 0;
+        ValueType* resval = 0;
+        UriType tempuri = UriType(finduri, allocator_);
+        UriType localuri = UriType(baseuri, allocator_);
+        if (doc.GetType() == kObjectType) {
+            // Establish the base URI of this object
+            typename ValueType::ConstMemberIterator m = doc.FindMember(SchemaType::GetIdString());
+            if (m != doc.MemberEnd() && m->value.GetType() == kStringType) {
+                localuri = UriType(m->value, allocator_).Resolve(baseuri, allocator_);
+            }
+            // See if it matches
+            if (localuri.Match(finduri, full)) {
+                RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaDocument::FindId (match)", full ? localuri.GetString() : localuri.GetBaseString());
+                resval = const_cast<ValueType *>(&doc);
+                resptr = here;
+                return resval;
+            }
+            // No match, continue looking
+            for (m = doc.MemberBegin(); m != doc.MemberEnd(); ++m) {
+                if (m->value.GetType() == kObjectType || m->value.GetType() == kArrayType) {
+                    resval = FindId(m->value, finduri, resptr, localuri, full, here.Append(m->name.GetString(), m->name.GetStringLength(), allocator_));
+                }
+                if (resval) break;
+            }
+        } else if (doc.GetType() == kArrayType) {
+            // Continue looking
+            for (typename ValueType::ConstValueIterator v = doc.Begin(); v != doc.End(); ++v) {
+                if (v->GetType() == kObjectType || v->GetType() == kArrayType) {
+                    resval = FindId(*v, finduri, resptr, localuri, full, here.Append(i, allocator_));
+                }
+                if (resval) break;
+                i++;
+            }
+        }
+        return resval;
+    }
+
+    // Added by PR #1393
+    void AddSchemaRefs(SchemaType* schema) {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaDocument::AddSchemaRefs");
+        while (!schemaRef_.Empty()) {
+            SchemaRefPtr *ref = schemaRef_.template Pop<SchemaRefPtr>(1);
+            SchemaEntry *entry = schemaMap_.template Push<SchemaEntry>();
+            new (entry) SchemaEntry(**ref, schema, false, allocator_);
+        }
+    }
+
+    // Added by PR #1393
+    bool IsCyclicRef(const PointerType& pointer) const {
+        for (const SchemaRefPtr* ref = schemaRef_.template Bottom<SchemaRefPtr>(); ref != schemaRef_.template End<SchemaRefPtr>(); ++ref)
+            if (pointer == **ref)
+                return true;
+        return false;
+    }
+
+    const SchemaType* GetSchema(const PointerType& pointer) const {
+        for (const SchemaEntry* target = schemaMap_.template Bottom<SchemaEntry>(); target != schemaMap_.template End<SchemaEntry>(); ++target)
+            if (pointer == target->pointer)
+                return target->schema;
+        return 0;
+    }
+
+    PointerType GetPointer(const SchemaType* schema) const {
+        for (const SchemaEntry* target = schemaMap_.template Bottom<SchemaEntry>(); target != schemaMap_.template End<SchemaEntry>(); ++target)
+            if (schema == target->schema)
+                return target->pointer;
+        return PointerType();
+    }
+
+    const SchemaType* GetTypeless() const { return typeless_; }
+
+    static const size_t kInitialSchemaMapSize = 64;
+    static const size_t kInitialSchemaRefSize = 64;
+
+    IRemoteSchemaDocumentProviderType* remoteProvider_;
+    Allocator *allocator_;
+    Allocator *ownAllocator_;
+    const SchemaType* root_;                //!< Root schema.
+    SchemaType* typeless_;
+    internal::Stack<Allocator> schemaMap_;  // Stores created Pointer -> Schemas
+    internal::Stack<Allocator> schemaRef_;  // Stores Pointer(s) from $ref(s) until resolved
+    GValue uri_;                            // Schema document URI
+    UriType docId_;
+    Specification spec_;
+    GValue error_;
+    GValue currentError_;
+};
+
+//! GenericSchemaDocument using Value type.
+typedef GenericSchemaDocument<Value> SchemaDocument;
+//! IGenericRemoteSchemaDocumentProvider using SchemaDocument.
+typedef IGenericRemoteSchemaDocumentProvider<SchemaDocument> IRemoteSchemaDocumentProvider;
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericSchemaValidator
+
+//! JSON Schema Validator.
+/*!
+    A SAX style JSON schema validator.
+    It uses a \c GenericSchemaDocument to validate SAX events.
+    It delegates the incoming SAX events to an output handler.
+    The default output handler does nothing.
+    It can be reused multiple times by calling \c Reset().
+
+    \tparam SchemaDocumentType Type of schema document.
+    \tparam OutputHandler Type of output handler. Default handler does nothing.
+    \tparam StateAllocator Allocator for storing the internal validation states.
+*/
+template <
+    typename SchemaDocumentType,
+    typename OutputHandler = BaseReaderHandler<typename SchemaDocumentType::SchemaType::EncodingType>,
+    typename StateAllocator = CrtAllocator>
+class GenericSchemaValidator :
+    public internal::ISchemaStateFactory<typename SchemaDocumentType::SchemaType>, 
+    public internal::ISchemaValidator,
+    public internal::IValidationErrorHandler<typename SchemaDocumentType::SchemaType> {
+public:
+    typedef typename SchemaDocumentType::SchemaType SchemaType;
+    typedef typename SchemaDocumentType::PointerType PointerType;
+    typedef typename SchemaType::EncodingType EncodingType;
+    typedef typename SchemaType::SValue SValue;
+    typedef typename EncodingType::Ch Ch;
+    typedef GenericStringRef<Ch> StringRefType;
+    typedef GenericValue<EncodingType, StateAllocator> ValueType;
+
+    //! Constructor without output handler.
+    /*!
+        \param schemaDocument The schema document to conform to.
+        \param allocator Optional allocator for storing internal validation states.
+        \param schemaStackCapacity Optional initial capacity of schema path stack.
+        \param documentStackCapacity Optional initial capacity of document path stack.
+    */
+    GenericSchemaValidator(
+        const SchemaDocumentType& schemaDocument,
+        StateAllocator* allocator = 0, 
+        size_t schemaStackCapacity = kDefaultSchemaStackCapacity,
+        size_t documentStackCapacity = kDefaultDocumentStackCapacity)
+        :
+        schemaDocument_(&schemaDocument),
+        root_(schemaDocument.GetRoot()),
+        stateAllocator_(allocator),
+        ownStateAllocator_(0),
+        schemaStack_(allocator, schemaStackCapacity),
+        documentStack_(allocator, documentStackCapacity),
+        outputHandler_(0),
+        error_(kObjectType),
+        currentError_(),
+        missingDependents_(),
+        valid_(true),
+        flags_(kValidateDefaultFlags),
+        depth_(0)
+    {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::GenericSchemaValidator");
+    }
+
+    //! Constructor with output handler.
+    /*!
+        \param schemaDocument The schema document to conform to.
+        \param allocator Optional allocator for storing internal validation states.
+        \param schemaStackCapacity Optional initial capacity of schema path stack.
+        \param documentStackCapacity Optional initial capacity of document path stack.
+    */
+    GenericSchemaValidator(
+        const SchemaDocumentType& schemaDocument,
+        OutputHandler& outputHandler,
+        StateAllocator* allocator = 0, 
+        size_t schemaStackCapacity = kDefaultSchemaStackCapacity,
+        size_t documentStackCapacity = kDefaultDocumentStackCapacity)
+        :
+        schemaDocument_(&schemaDocument),
+        root_(schemaDocument.GetRoot()),
+        stateAllocator_(allocator),
+        ownStateAllocator_(0),
+        schemaStack_(allocator, schemaStackCapacity),
+        documentStack_(allocator, documentStackCapacity),
+        outputHandler_(&outputHandler),
+        error_(kObjectType),
+        currentError_(),
+        missingDependents_(),
+        valid_(true),
+        flags_(kValidateDefaultFlags),
+        depth_(0)
+    {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::GenericSchemaValidator (output handler)");
+    }
+
+    //! Destructor.
+    ~GenericSchemaValidator() {
+        Reset();
+        RAPIDJSON_DELETE(ownStateAllocator_);
+    }
+
+    //! Reset the internal states.
+    void Reset() {
+        while (!schemaStack_.Empty())
+            PopSchema();
+        documentStack_.Clear();
+        ResetError();
+    }
+
+    //! Reset the error state.
+    void ResetError() {
+        error_.SetObject();
+        currentError_.SetNull();
+        missingDependents_.SetNull();
+        valid_ = true;
+    }
+
+    //! Implementation of ISchemaValidator
+    void SetValidateFlags(unsigned flags) {
+        flags_ = flags;
+    }
+    virtual unsigned GetValidateFlags() const {
+        return flags_;
+    }
+
+    virtual bool IsValid() const {
+        if (!valid_) return false;
+        if (GetContinueOnErrors() && !error_.ObjectEmpty()) return false;
+        return true;
+    }
+    //! End of Implementation of ISchemaValidator
+
+    //! Gets the error object.
+    ValueType& GetError() { return error_; }
+    const ValueType& GetError() const { return error_; }
+
+    //! Gets the JSON pointer pointed to the invalid schema.
+    //  If reporting all errors, the stack will be empty.
+    PointerType GetInvalidSchemaPointer() const {
+        return schemaStack_.Empty() ? PointerType() : CurrentSchema().GetPointer();
+    }
+
+    //! Gets the keyword of invalid schema.
+    //  If reporting all errors, the stack will be empty, so return "errors".
+    const Ch* GetInvalidSchemaKeyword() const {
+        if (!schemaStack_.Empty()) return CurrentContext().invalidKeyword;
+        if (GetContinueOnErrors() && !error_.ObjectEmpty()) return static_cast<const Ch*>(GetErrorsString());
+        return 0;
+    }
+
+    //! Gets the error code of invalid schema.
+    //  If reporting all errors, the stack will be empty, so return kValidateErrors.
+    ValidateErrorCode GetInvalidSchemaCode() const {
+        if (!schemaStack_.Empty()) return CurrentContext().invalidCode;
+        if (GetContinueOnErrors() && !error_.ObjectEmpty()) return kValidateErrors;
+        return kValidateErrorNone;
+    }
+
+    //! Gets the JSON pointer pointed to the invalid value.
+    //  If reporting all errors, the stack will be empty.
+    PointerType GetInvalidDocumentPointer() const {
+        if (documentStack_.Empty()) {
+            return PointerType();
+        }
+        else {
+            return PointerType(documentStack_.template Bottom<Ch>(), documentStack_.GetSize() / sizeof(Ch));
+        }
+    }
+
+    void NotMultipleOf(int64_t actual, const SValue& expected) {
+        AddNumberError(kValidateErrorMultipleOf, ValueType(actual).Move(), expected);
+    }
+    void NotMultipleOf(uint64_t actual, const SValue& expected) {
+        AddNumberError(kValidateErrorMultipleOf, ValueType(actual).Move(), expected);
+    }
+    void NotMultipleOf(double actual, const SValue& expected) {
+        AddNumberError(kValidateErrorMultipleOf, ValueType(actual).Move(), expected);
+    }
+    void AboveMaximum(int64_t actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMaximumString : 0);
+    }
+    void AboveMaximum(uint64_t actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMaximumString : 0);
+    }
+    void AboveMaximum(double actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMaximum : kValidateErrorMaximum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMaximumString : 0);
+    }
+    void BelowMinimum(int64_t actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMinimumString : 0);
+    }
+    void BelowMinimum(uint64_t actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMinimumString : 0);
+    }
+    void BelowMinimum(double actual, const SValue& expected, bool exclusive) {
+        AddNumberError(exclusive ? kValidateErrorExclusiveMinimum : kValidateErrorMinimum, ValueType(actual).Move(), expected,
+            exclusive ? &SchemaType::GetExclusiveMinimumString : 0);
+    }
+
+    void TooLong(const Ch* str, SizeType length, SizeType expected) {
+        AddNumberError(kValidateErrorMaxLength,
+            ValueType(str, length, GetStateAllocator()).Move(), SValue(expected).Move());
+    }
+    void TooShort(const Ch* str, SizeType length, SizeType expected) {
+        AddNumberError(kValidateErrorMinLength,
+            ValueType(str, length, GetStateAllocator()).Move(), SValue(expected).Move());
+    }
+    void DoesNotMatch(const Ch* str, SizeType length) {
+        currentError_.SetObject();
+        currentError_.AddMember(GetActualString(), ValueType(str, length, GetStateAllocator()).Move(), GetStateAllocator());
+        AddCurrentError(kValidateErrorPattern);
+    }
+
+    void DisallowedItem(SizeType index) {
+        currentError_.SetObject();
+        currentError_.AddMember(GetDisallowedString(), ValueType(index).Move(), GetStateAllocator());
+        AddCurrentError(kValidateErrorAdditionalItems, true);
+    }
+    void TooFewItems(SizeType actualCount, SizeType expectedCount) {
+        AddNumberError(kValidateErrorMinItems,
+            ValueType(actualCount).Move(), SValue(expectedCount).Move());
+    }
+    void TooManyItems(SizeType actualCount, SizeType expectedCount) {
+        AddNumberError(kValidateErrorMaxItems,
+            ValueType(actualCount).Move(), SValue(expectedCount).Move());
+    }
+    void DuplicateItems(SizeType index1, SizeType index2) {
+        ValueType duplicates(kArrayType);
+        duplicates.PushBack(index1, GetStateAllocator());
+        duplicates.PushBack(index2, GetStateAllocator());
+        currentError_.SetObject();
+        currentError_.AddMember(GetDuplicatesString(), duplicates, GetStateAllocator());
+        AddCurrentError(kValidateErrorUniqueItems, true);
+    }
+
+    void TooManyProperties(SizeType actualCount, SizeType expectedCount) {
+        AddNumberError(kValidateErrorMaxProperties,
+            ValueType(actualCount).Move(), SValue(expectedCount).Move());
+    }
+    void TooFewProperties(SizeType actualCount, SizeType expectedCount) {
+        AddNumberError(kValidateErrorMinProperties,
+            ValueType(actualCount).Move(), SValue(expectedCount).Move());
+    }
+    void StartMissingProperties() {
+        currentError_.SetArray();
+    }
+    void AddMissingProperty(const SValue& name) {
+        currentError_.PushBack(ValueType(name, GetStateAllocator()).Move(), GetStateAllocator());
+    }
+    bool EndMissingProperties() {
+        if (currentError_.Empty())
+            return false;
+        ValueType error(kObjectType);
+        error.AddMember(GetMissingString(), currentError_, GetStateAllocator());
+        currentError_ = error;
+        AddCurrentError(kValidateErrorRequired);
+        return true;
+    }
+    void PropertyViolations(ISchemaValidator** subvalidators, SizeType count) {
+        for (SizeType i = 0; i < count; ++i)
+            MergeError(static_cast<GenericSchemaValidator*>(subvalidators[i])->GetError());
+    }
+    void DisallowedProperty(const Ch* name, SizeType length) {
+        currentError_.SetObject();
+        currentError_.AddMember(GetDisallowedString(), ValueType(name, length, GetStateAllocator()).Move(), GetStateAllocator());
+        AddCurrentError(kValidateErrorAdditionalProperties, true);
+    }
+
+    void StartDependencyErrors() {
+        currentError_.SetObject();
+    }
+    void StartMissingDependentProperties() {
+        missingDependents_.SetArray();
+    }
+    void AddMissingDependentProperty(const SValue& targetName) {
+        missingDependents_.PushBack(ValueType(targetName, GetStateAllocator()).Move(), GetStateAllocator());
+    }
+    void EndMissingDependentProperties(const SValue& sourceName) {
+        if (!missingDependents_.Empty()) {
+            // Create equivalent 'required' error
+            ValueType error(kObjectType);
+            ValidateErrorCode code = kValidateErrorRequired;
+            error.AddMember(GetMissingString(), missingDependents_.Move(), GetStateAllocator());
+            AddErrorCode(error, code);
+            AddErrorInstanceLocation(error, false);
+            // When appending to a pointer ensure its allocator is used
+            PointerType schemaRef = GetInvalidSchemaPointer().Append(SchemaType::GetValidateErrorKeyword(kValidateErrorDependencies), &GetInvalidSchemaPointer().GetAllocator());
+            AddErrorSchemaLocation(error, schemaRef.Append(sourceName.GetString(), sourceName.GetStringLength(), &GetInvalidSchemaPointer().GetAllocator()));
+            ValueType wrapper(kObjectType);
+            wrapper.AddMember(ValueType(SchemaType::GetValidateErrorKeyword(code), GetStateAllocator()).Move(), error, GetStateAllocator());
+            currentError_.AddMember(ValueType(sourceName, GetStateAllocator()).Move(), wrapper, GetStateAllocator());
+        }
+    }
+    void AddDependencySchemaError(const SValue& sourceName, ISchemaValidator* subvalidator) {
+        currentError_.AddMember(ValueType(sourceName, GetStateAllocator()).Move(),
+            static_cast<GenericSchemaValidator*>(subvalidator)->GetError(), GetStateAllocator());
+    }
+    bool EndDependencyErrors() {
+        if (currentError_.ObjectEmpty())
+            return false;
+        ValueType error(kObjectType);
+        error.AddMember(GetErrorsString(), currentError_, GetStateAllocator());
+        currentError_ = error;
+        AddCurrentError(kValidateErrorDependencies);
+        return true;
+    }
+
+    void DisallowedValue(const ValidateErrorCode code = kValidateErrorEnum) {
+        currentError_.SetObject();
+        AddCurrentError(code);
+    }
+    void StartDisallowedType() {
+        currentError_.SetArray();
+    }
+    void AddExpectedType(const typename SchemaType::ValueType& expectedType) {
+        currentError_.PushBack(ValueType(expectedType, GetStateAllocator()).Move(), GetStateAllocator());
+    }
+    void EndDisallowedType(const typename SchemaType::ValueType& actualType) {
+        ValueType error(kObjectType);
+        error.AddMember(GetExpectedString(), currentError_, GetStateAllocator());
+        error.AddMember(GetActualString(), ValueType(actualType, GetStateAllocator()).Move(), GetStateAllocator());
+        currentError_ = error;
+        AddCurrentError(kValidateErrorType);
+    }
+    void NotAllOf(ISchemaValidator** subvalidators, SizeType count) {
+        // Treat allOf like oneOf and anyOf to match https://rapidjson.org/md_doc_schema.html#allOf-anyOf-oneOf
+        AddErrorArray(kValidateErrorAllOf, subvalidators, count);
+        //for (SizeType i = 0; i < count; ++i) {
+        //    MergeError(static_cast<GenericSchemaValidator*>(subvalidators[i])->GetError());
+        //}
+    }
+    void NoneOf(ISchemaValidator** subvalidators, SizeType count) {
+        AddErrorArray(kValidateErrorAnyOf, subvalidators, count);
+    }
+    void NotOneOf(ISchemaValidator** subvalidators, SizeType count) {
+        AddErrorArray(kValidateErrorOneOf, subvalidators, count);
+    }
+    void MultipleOneOf(SizeType index1, SizeType index2) {
+        ValueType matches(kArrayType);
+        matches.PushBack(index1, GetStateAllocator());
+        matches.PushBack(index2, GetStateAllocator());
+        currentError_.SetObject();
+        currentError_.AddMember(GetMatchesString(), matches, GetStateAllocator());
+        AddCurrentError(kValidateErrorOneOfMatch);
+    }
+    void Disallowed() {
+        currentError_.SetObject();
+        AddCurrentError(kValidateErrorNot);
+    }
+    void DisallowedWhenWriting() {
+        currentError_.SetObject();
+        AddCurrentError(kValidateErrorReadOnly);
+    }
+    void DisallowedWhenReading() {
+        currentError_.SetObject();
+        AddCurrentError(kValidateErrorWriteOnly);
+    }
+
+#define RAPIDJSON_STRING_(name, ...) \
+    static const StringRefType& Get##name##String() {\
+        static const Ch s[] = { __VA_ARGS__, '\0' };\
+        static const StringRefType v(s, static_cast<SizeType>(sizeof(s) / sizeof(Ch) - 1)); \
+        return v;\
+    }
+
+    RAPIDJSON_STRING_(InstanceRef, 'i', 'n', 's', 't', 'a', 'n', 'c', 'e', 'R', 'e', 'f')
+    RAPIDJSON_STRING_(SchemaRef, 's', 'c', 'h', 'e', 'm', 'a', 'R', 'e', 'f')
+    RAPIDJSON_STRING_(Expected, 'e', 'x', 'p', 'e', 'c', 't', 'e', 'd')
+    RAPIDJSON_STRING_(Actual, 'a', 'c', 't', 'u', 'a', 'l')
+    RAPIDJSON_STRING_(Disallowed, 'd', 'i', 's', 'a', 'l', 'l', 'o', 'w', 'e', 'd')
+    RAPIDJSON_STRING_(Missing, 'm', 'i', 's', 's', 'i', 'n', 'g')
+    RAPIDJSON_STRING_(Errors, 'e', 'r', 'r', 'o', 'r', 's')
+    RAPIDJSON_STRING_(ErrorCode, 'e', 'r', 'r', 'o', 'r', 'C', 'o', 'd', 'e')
+    RAPIDJSON_STRING_(ErrorMessage, 'e', 'r', 'r', 'o', 'r', 'M', 'e', 's', 's', 'a', 'g', 'e')
+    RAPIDJSON_STRING_(Duplicates, 'd', 'u', 'p', 'l', 'i', 'c', 'a', 't', 'e', 's')
+    RAPIDJSON_STRING_(Matches, 'm', 'a', 't', 'c', 'h', 'e', 's')
+
+#undef RAPIDJSON_STRING_
+
+#define RAPIDJSON_SCHEMA_HANDLE_BEGIN_(method, arg1)\
+    if (!valid_) return false; \
+    if ((!BeginValue() && !GetContinueOnErrors()) || (!CurrentSchema().method arg1 && !GetContinueOnErrors())) {\
+        *documentStack_.template Push<Ch>() = '\0';\
+        documentStack_.template Pop<Ch>(1);\
+        RAPIDJSON_SCHEMA_PRINT(InvalidDocument, documentStack_.template Bottom<Ch>());\
+        valid_ = false;\
+        return valid_;\
+    }
+
+#define RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(method, arg2)\
+    for (Context* context = schemaStack_.template Bottom<Context>(); context != schemaStack_.template End<Context>(); context++) {\
+        if (context->hasher)\
+            static_cast<HasherType*>(context->hasher)->method arg2;\
+        if (context->validators)\
+            for (SizeType i_ = 0; i_ < context->validatorCount; i_++)\
+                static_cast<GenericSchemaValidator*>(context->validators[i_])->method arg2;\
+        if (context->patternPropertiesValidators)\
+            for (SizeType i_ = 0; i_ < context->patternPropertiesValidatorCount; i_++)\
+                static_cast<GenericSchemaValidator*>(context->patternPropertiesValidators[i_])->method arg2;\
+    }
+
+#define RAPIDJSON_SCHEMA_HANDLE_END_(method, arg2)\
+    valid_ = (EndValue() || GetContinueOnErrors()) && (!outputHandler_ || outputHandler_->method arg2);\
+    return valid_;
+
+#define RAPIDJSON_SCHEMA_HANDLE_VALUE_(method, arg1, arg2) \
+    RAPIDJSON_SCHEMA_HANDLE_BEGIN_   (method, arg1);\
+    RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(method, arg2);\
+    RAPIDJSON_SCHEMA_HANDLE_END_     (method, arg2)
+
+    bool Null()             { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Null,   (CurrentContext()), ( )); }
+    bool Bool(bool b)       { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Bool,   (CurrentContext(), b), (b)); }
+    bool Int(int i)         { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Int,    (CurrentContext(), i), (i)); }
+    bool Uint(unsigned u)   { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Uint,   (CurrentContext(), u), (u)); }
+    bool Int64(int64_t i)   { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Int64,  (CurrentContext(), i), (i)); }
+    bool Uint64(uint64_t u) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Uint64, (CurrentContext(), u), (u)); }
+    bool Double(double d)   { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Double, (CurrentContext(), d), (d)); }
+    bool RawNumber(const Ch* str, SizeType length, bool copy)
+                                    { RAPIDJSON_SCHEMA_HANDLE_VALUE_(String, (CurrentContext(), str, length, copy), (str, length, copy)); }
+    bool String(const Ch* str, SizeType length, bool copy)
+                                    { RAPIDJSON_SCHEMA_HANDLE_VALUE_(String, (CurrentContext(), str, length, copy), (str, length, copy)); }
+
+    bool StartObject() {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::StartObject");
+        RAPIDJSON_SCHEMA_HANDLE_BEGIN_(StartObject, (CurrentContext()));
+        RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartObject, ());
+        valid_ = !outputHandler_ || outputHandler_->StartObject();
+        return valid_;
+    }
+    
+    bool Key(const Ch* str, SizeType len, bool copy) {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::Key", str);
+        if (!valid_) return false;
+        AppendToken(str, len);
+        if (!CurrentSchema().Key(CurrentContext(), str, len, copy) && !GetContinueOnErrors()) {
+            valid_ = false;
+            return valid_;
+        }
+        RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(Key, (str, len, copy));
+        valid_ = !outputHandler_ || outputHandler_->Key(str, len, copy);
+        return valid_;
+    }
+    
+    bool EndObject(SizeType memberCount) {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::EndObject");
+        if (!valid_) return false;
+        RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndObject, (memberCount));
+        if (!CurrentSchema().EndObject(CurrentContext(), memberCount) && !GetContinueOnErrors()) { 
+            valid_ = false; 
+            return valid_; 
+        }
+        RAPIDJSON_SCHEMA_HANDLE_END_(EndObject, (memberCount));
+    }
+
+    bool StartArray() {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::StartArray");
+        RAPIDJSON_SCHEMA_HANDLE_BEGIN_(StartArray, (CurrentContext()));
+        RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartArray, ());
+        valid_ = !outputHandler_ || outputHandler_->StartArray();
+        return valid_;
+    }
+    
+    bool EndArray(SizeType elementCount) {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::EndArray");
+        if (!valid_) return false;
+        RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndArray, (elementCount));
+        if (!CurrentSchema().EndArray(CurrentContext(), elementCount) && !GetContinueOnErrors()) {
+            valid_ = false;
+            return valid_;
+        }
+        RAPIDJSON_SCHEMA_HANDLE_END_(EndArray, (elementCount));
+    }
+
+#undef RAPIDJSON_SCHEMA_HANDLE_BEGIN_
+#undef RAPIDJSON_SCHEMA_HANDLE_PARALLEL_
+#undef RAPIDJSON_SCHEMA_HANDLE_VALUE_
+
+    // Implementation of ISchemaStateFactory<SchemaType>
+    virtual ISchemaValidator* CreateSchemaValidator(const SchemaType& root, const bool inheritContinueOnErrors) {
+        *documentStack_.template Push<Ch>() = '\0';
+        documentStack_.template Pop<Ch>(1);
+        ISchemaValidator* sv = new (GetStateAllocator().Malloc(sizeof(GenericSchemaValidator))) GenericSchemaValidator(*schemaDocument_, root, documentStack_.template Bottom<char>(), documentStack_.GetSize(),
+        depth_ + 1,
+        &GetStateAllocator());
+        sv->SetValidateFlags(inheritContinueOnErrors ? GetValidateFlags() : GetValidateFlags() & ~static_cast<unsigned>(kValidateContinueOnErrorFlag));
+        return sv;
+    }
+
+    virtual void DestroySchemaValidator(ISchemaValidator* validator) {
+        GenericSchemaValidator* v = static_cast<GenericSchemaValidator*>(validator);
+        v->~GenericSchemaValidator();
+        StateAllocator::Free(v);
+    }
+
+    virtual void* CreateHasher() {
+        return new (GetStateAllocator().Malloc(sizeof(HasherType))) HasherType(&GetStateAllocator());
+    }
+
+    virtual uint64_t GetHashCode(void* hasher) {
+        return static_cast<HasherType*>(hasher)->GetHashCode();
+    }
+
+    virtual void DestroryHasher(void* hasher) {
+        HasherType* h = static_cast<HasherType*>(hasher);
+        h->~HasherType();
+        StateAllocator::Free(h);
+    }
+
+    virtual void* MallocState(size_t size) {
+        return GetStateAllocator().Malloc(size);
+    }
+
+    virtual void FreeState(void* p) {
+        StateAllocator::Free(p);
+    }
+    // End of implementation of ISchemaStateFactory<SchemaType>
+
+private:
+    typedef typename SchemaType::Context Context;
+    typedef GenericValue<UTF8<>, StateAllocator> HashCodeArray;
+    typedef internal::Hasher<EncodingType, StateAllocator> HasherType;
+
+    GenericSchemaValidator( 
+        const SchemaDocumentType& schemaDocument,
+        const SchemaType& root,
+        const char* basePath, size_t basePathSize,
+        unsigned depth,
+        StateAllocator* allocator = 0,
+        size_t schemaStackCapacity = kDefaultSchemaStackCapacity,
+        size_t documentStackCapacity = kDefaultDocumentStackCapacity)
+        :
+        schemaDocument_(&schemaDocument),
+        root_(root),
+        stateAllocator_(allocator),
+        ownStateAllocator_(0),
+        schemaStack_(allocator, schemaStackCapacity),
+        documentStack_(allocator, documentStackCapacity),
+        outputHandler_(0),
+        error_(kObjectType),
+        currentError_(),
+        missingDependents_(),
+        valid_(true),
+        flags_(kValidateDefaultFlags),
+        depth_(depth)
+    {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::GenericSchemaValidator (internal)", basePath && basePathSize ? basePath : "");
+        if (basePath && basePathSize)
+            memcpy(documentStack_.template Push<char>(basePathSize), basePath, basePathSize);
+    }
+
+    StateAllocator& GetStateAllocator() {
+        if (!stateAllocator_)
+            stateAllocator_ = ownStateAllocator_ = RAPIDJSON_NEW(StateAllocator)();
+        return *stateAllocator_;
+    }
+
+    bool GetContinueOnErrors() const {
+        return flags_ & kValidateContinueOnErrorFlag;
+    }
+
+    bool BeginValue() {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::BeginValue");
+        if (schemaStack_.Empty())
+            PushSchema(root_);
+        else {
+            if (CurrentContext().inArray)
+                internal::TokenHelper<internal::Stack<StateAllocator>, Ch>::AppendIndexToken(documentStack_, CurrentContext().arrayElementIndex);
+
+            if (!CurrentSchema().BeginValue(CurrentContext()) && !GetContinueOnErrors())
+                return false;
+
+            SizeType count = CurrentContext().patternPropertiesSchemaCount;
+            const SchemaType** sa = CurrentContext().patternPropertiesSchemas;
+            typename Context::PatternValidatorType patternValidatorType = CurrentContext().valuePatternValidatorType;
+            bool valueUniqueness = CurrentContext().valueUniqueness;
+            RAPIDJSON_ASSERT(CurrentContext().valueSchema);
+            PushSchema(*CurrentContext().valueSchema);
+
+            if (count > 0) {
+                CurrentContext().objectPatternValidatorType = patternValidatorType;
+                ISchemaValidator**& va = CurrentContext().patternPropertiesValidators;
+                SizeType& validatorCount = CurrentContext().patternPropertiesValidatorCount;
+                va = static_cast<ISchemaValidator**>(MallocState(sizeof(ISchemaValidator*) * count));
+                std::memset(va, 0, sizeof(ISchemaValidator*) * count);
+                for (SizeType i = 0; i < count; i++)
+                    va[validatorCount++] = CreateSchemaValidator(*sa[i], true);  // Inherit continueOnError
+            }
+
+            CurrentContext().arrayUniqueness = valueUniqueness;
+        }
+        return true;
+    }
+
+    bool EndValue() {
+        RAPIDJSON_SCHEMA_PRINT(Method, "GenericSchemaValidator::EndValue");
+        if (!CurrentSchema().EndValue(CurrentContext()) && !GetContinueOnErrors())
+            return false;
+
+        GenericStringBuffer<EncodingType> sb;
+        schemaDocument_->GetPointer(&CurrentSchema()).StringifyUriFragment(sb);
+        *documentStack_.template Push<Ch>() = '\0';
+        documentStack_.template Pop<Ch>(1);
+        RAPIDJSON_SCHEMA_PRINT(ValidatorPointers, sb.GetString(), documentStack_.template Bottom<Ch>(), depth_);
+        void* hasher = CurrentContext().hasher;
+        uint64_t h = hasher && CurrentContext().arrayUniqueness ? static_cast<HasherType*>(hasher)->GetHashCode() : 0;
+        
+        PopSchema();
+
+        if (!schemaStack_.Empty()) {
+            Context& context = CurrentContext();
+            // Only check uniqueness if there is a hasher
+            if (hasher && context.valueUniqueness) {
+                HashCodeArray* a = static_cast<HashCodeArray*>(context.arrayElementHashCodes);
+                if (!a)
+                    CurrentContext().arrayElementHashCodes = a = new (GetStateAllocator().Malloc(sizeof(HashCodeArray))) HashCodeArray(kArrayType);
+                for (typename HashCodeArray::ConstValueIterator itr = a->Begin(); itr != a->End(); ++itr)
+                    if (itr->GetUint64() == h) {
+                        DuplicateItems(static_cast<SizeType>(itr - a->Begin()), a->Size());
+                        // Cleanup before returning if continuing
+                        if (GetContinueOnErrors()) {
+                            a->PushBack(h, GetStateAllocator());
+                            while (!documentStack_.Empty() && *documentStack_.template Pop<Ch>(1) != '/');
+                        }
+                        RAPIDJSON_INVALID_KEYWORD_RETURN(kValidateErrorUniqueItems);
+                    }
+                a->PushBack(h, GetStateAllocator());
+            }
+        }
+
+        // Remove the last token of document pointer
+        while (!documentStack_.Empty() && *documentStack_.template Pop<Ch>(1) != '/')
+            ;
+
+        return true;
+    }
+
+    void AppendToken(const Ch* str, SizeType len) {
+        documentStack_.template Reserve<Ch>(1 + len * 2); // worst case all characters are escaped as two characters
+        *documentStack_.template PushUnsafe<Ch>() = '/';
+        for (SizeType i = 0; i < len; i++) {
+            if (str[i] == '~') {
+                *documentStack_.template PushUnsafe<Ch>() = '~';
+                *documentStack_.template PushUnsafe<Ch>() = '0';
+            }
+            else if (str[i] == '/') {
+                *documentStack_.template PushUnsafe<Ch>() = '~';
+                *documentStack_.template PushUnsafe<Ch>() = '1';
+            }
+            else
+                *documentStack_.template PushUnsafe<Ch>() = str[i];
+        }
+    }
+
+    RAPIDJSON_FORCEINLINE void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, *this, &schema, flags_); }
+    
+    RAPIDJSON_FORCEINLINE void PopSchema() {
+        Context* c = schemaStack_.template Pop<Context>(1);
+        if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) {
+            a->~HashCodeArray();
+            StateAllocator::Free(a);
+        }
+        c->~Context();
+    }
+
+    void AddErrorInstanceLocation(ValueType& result, bool parent) {
+        GenericStringBuffer<EncodingType> sb;
+        PointerType instancePointer = GetInvalidDocumentPointer();
+        ((parent && instancePointer.GetTokenCount() > 0)
+         ? PointerType(instancePointer.GetTokens(), instancePointer.GetTokenCount() - 1)
+         : instancePointer).StringifyUriFragment(sb);
+        ValueType instanceRef(sb.GetString(), static_cast<SizeType>(sb.GetSize() / sizeof(Ch)),
+                              GetStateAllocator());
+        result.AddMember(GetInstanceRefString(), instanceRef, GetStateAllocator());
+    }
+
+    void AddErrorSchemaLocation(ValueType& result, PointerType schema = PointerType()) {
+        GenericStringBuffer<EncodingType> sb;
+        SizeType len = CurrentSchema().GetURI().GetStringLength();
+        if (len) memcpy(sb.Push(len), CurrentSchema().GetURI().GetString(), len * sizeof(Ch));
+        if (schema.GetTokenCount()) schema.StringifyUriFragment(sb);
+        else GetInvalidSchemaPointer().StringifyUriFragment(sb);
+        ValueType schemaRef(sb.GetString(), static_cast<SizeType>(sb.GetSize() / sizeof(Ch)),
+            GetStateAllocator());
+        result.AddMember(GetSchemaRefString(), schemaRef, GetStateAllocator());
+    }
+
+    void AddErrorCode(ValueType& result, const ValidateErrorCode code) {
+        result.AddMember(GetErrorCodeString(), code, GetStateAllocator());
+    }
+
+    void AddError(ValueType& keyword, ValueType& error) {
+        typename ValueType::MemberIterator member = error_.FindMember(keyword);
+        if (member == error_.MemberEnd())
+            error_.AddMember(keyword, error, GetStateAllocator());
+        else {
+            if (member->value.IsObject()) {
+                ValueType errors(kArrayType);
+                errors.PushBack(member->value, GetStateAllocator());
+                member->value = errors;
+            }
+            member->value.PushBack(error, GetStateAllocator());
+        }
+    }
+
+    void AddCurrentError(const ValidateErrorCode code, bool parent = false) {
+        AddErrorCode(currentError_, code);
+        AddErrorInstanceLocation(currentError_, parent);
+        AddErrorSchemaLocation(currentError_);
+        AddError(ValueType(SchemaType::GetValidateErrorKeyword(code), GetStateAllocator(), false).Move(), currentError_);
+    }
+
+    void MergeError(ValueType& other) {
+        for (typename ValueType::MemberIterator it = other.MemberBegin(), end = other.MemberEnd(); it != end; ++it) {
+            AddError(it->name, it->value);
+        }
+    }
+
+    void AddNumberError(const ValidateErrorCode code, ValueType& actual, const SValue& expected,
+        const typename SchemaType::ValueType& (*exclusive)() = 0) {
+        currentError_.SetObject();
+        currentError_.AddMember(GetActualString(), actual, GetStateAllocator());
+        currentError_.AddMember(GetExpectedString(), ValueType(expected, GetStateAllocator()).Move(), GetStateAllocator());
+        if (exclusive)
+            currentError_.AddMember(ValueType(exclusive(), GetStateAllocator()).Move(), true, GetStateAllocator());
+        AddCurrentError(code);
+    }
+
+    void AddErrorArray(const ValidateErrorCode code,
+        ISchemaValidator** subvalidators, SizeType count) {
+        ValueType errors(kArrayType);
+        for (SizeType i = 0; i < count; ++i)
+            errors.PushBack(static_cast<GenericSchemaValidator*>(subvalidators[i])->GetError(), GetStateAllocator());
+        currentError_.SetObject();
+        currentError_.AddMember(GetErrorsString(), errors, GetStateAllocator());
+        AddCurrentError(code);
+    }
+
+    const SchemaType& CurrentSchema() const { return *schemaStack_.template Top<Context>()->schema; }
+    Context& CurrentContext() { return *schemaStack_.template Top<Context>(); }
+    const Context& CurrentContext() const { return *schemaStack_.template Top<Context>(); }
+
+    static const size_t kDefaultSchemaStackCapacity = 1024;
+    static const size_t kDefaultDocumentStackCapacity = 256;
+    const SchemaDocumentType* schemaDocument_;
+    const SchemaType& root_;
+    StateAllocator* stateAllocator_;
+    StateAllocator* ownStateAllocator_;
+    internal::Stack<StateAllocator> schemaStack_;    //!< stack to store the current path of schema (BaseSchemaType *)
+    internal::Stack<StateAllocator> documentStack_;  //!< stack to store the current path of validating document (Ch)
+    OutputHandler* outputHandler_;
+    ValueType error_;
+    ValueType currentError_;
+    ValueType missingDependents_;
+    bool valid_;
+    unsigned flags_;
+    unsigned depth_;
+};
+
+typedef GenericSchemaValidator<SchemaDocument> SchemaValidator;
+
+///////////////////////////////////////////////////////////////////////////////
+// SchemaValidatingReader
+
+//! A helper class for parsing with validation.
+/*!
+    This helper class is a functor, designed as a parameter of \ref GenericDocument::Populate().
+
+    \tparam parseFlags Combination of \ref ParseFlag.
+    \tparam InputStream Type of input stream, implementing Stream concept.
+    \tparam SourceEncoding Encoding of the input stream.
+    \tparam SchemaDocumentType Type of schema document.
+    \tparam StackAllocator Allocator type for stack.
+*/
+template <
+    unsigned parseFlags,
+    typename InputStream,
+    typename SourceEncoding,
+    typename SchemaDocumentType = SchemaDocument,
+    typename StackAllocator = CrtAllocator>
+class SchemaValidatingReader {
+public:
+    typedef typename SchemaDocumentType::PointerType PointerType;
+    typedef typename InputStream::Ch Ch;
+    typedef GenericValue<SourceEncoding, StackAllocator> ValueType;
+
+    //! Constructor
+    /*!
+        \param is Input stream.
+        \param sd Schema document.
+    */
+    SchemaValidatingReader(InputStream& is, const SchemaDocumentType& sd) : is_(is), sd_(sd), invalidSchemaKeyword_(), invalidSchemaCode_(kValidateErrorNone), error_(kObjectType), isValid_(true) {}
+
+    template <typename Handler>
+    bool operator()(Handler& handler) {
+        GenericReader<SourceEncoding, typename SchemaDocumentType::EncodingType, StackAllocator> reader;
+        GenericSchemaValidator<SchemaDocumentType, Handler> validator(sd_, handler);
+        parseResult_ = reader.template Parse<parseFlags>(is_, validator);
+
+        isValid_ = validator.IsValid();
+        if (isValid_) {
+            invalidSchemaPointer_ = PointerType();
+            invalidSchemaKeyword_ = 0;
+            invalidDocumentPointer_ = PointerType();
+            error_.SetObject();
+        }
+        else {
+            invalidSchemaPointer_ = validator.GetInvalidSchemaPointer();
+            invalidSchemaKeyword_ = validator.GetInvalidSchemaKeyword();
+            invalidSchemaCode_ = validator.GetInvalidSchemaCode();
+            invalidDocumentPointer_ = validator.GetInvalidDocumentPointer();
+            error_.CopyFrom(validator.GetError(), allocator_);
+        }
+
+        return parseResult_;
+    }
+
+    const ParseResult& GetParseResult() const { return parseResult_; }
+    bool IsValid() const { return isValid_; }
+    const PointerType& GetInvalidSchemaPointer() const { return invalidSchemaPointer_; }
+    const Ch* GetInvalidSchemaKeyword() const { return invalidSchemaKeyword_; }
+    const PointerType& GetInvalidDocumentPointer() const { return invalidDocumentPointer_; }
+    const ValueType& GetError() const { return error_; }
+    ValidateErrorCode GetInvalidSchemaCode() const { return invalidSchemaCode_; }
+
+private:
+    InputStream& is_;
+    const SchemaDocumentType& sd_;
+
+    ParseResult parseResult_;
+    PointerType invalidSchemaPointer_;
+    const Ch* invalidSchemaKeyword_;
+    PointerType invalidDocumentPointer_;
+    ValidateErrorCode invalidSchemaCode_;
+    StackAllocator allocator_;
+    ValueType error_;
+    bool isValid_;
+};
+
+RAPIDJSON_NAMESPACE_END
+RAPIDJSON_DIAG_POP
+
+#endif // RAPIDJSON_SCHEMA_H_
diff --git a/include/rapidjson/stream.h b/include/rapidjson/stream.h
new file mode 100644
index 0000000000..1fd70915c5
--- /dev/null
+++ b/include/rapidjson/stream.h
@@ -0,0 +1,223 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "rapidjson.h"
+
+#ifndef RAPIDJSON_STREAM_H_
+#define RAPIDJSON_STREAM_H_
+
+#include "encodings.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+//  Stream
+
+/*! \class rapidjson::Stream
+    \brief Concept for reading and writing characters.
+
+    For read-only stream, no need to implement PutBegin(), Put(), Flush() and PutEnd().
+
+    For write-only stream, only need to implement Put() and Flush().
+
+\code
+concept Stream {
+    typename Ch;    //!< Character type of the stream.
+
+    //! Read the current character from stream without moving the read cursor.
+    Ch Peek() const;
+
+    //! Read the current character from stream and moving the read cursor to next character.
+    Ch Take();
+
+    //! Get the current read cursor.
+    //! \return Number of characters read from start.
+    size_t Tell();
+
+    //! Begin writing operation at the current read pointer.
+    //! \return The begin writer pointer.
+    Ch* PutBegin();
+
+    //! Write a character.
+    void Put(Ch c);
+
+    //! Flush the buffer.
+    void Flush();
+
+    //! End the writing operation.
+    //! \param begin The begin write pointer returned by PutBegin().
+    //! \return Number of characters written.
+    size_t PutEnd(Ch* begin);
+}
+\endcode
+*/
+
+//! Provides additional information for stream.
+/*!
+    By using traits pattern, this type provides a default configuration for stream.
+    For custom stream, this type can be specialized for other configuration.
+    See TEST(Reader, CustomStringStream) in readertest.cpp for example.
+*/
+template<typename Stream>
+struct StreamTraits {
+    //! Whether to make local copy of stream for optimization during parsing.
+    /*!
+        By default, for safety, streams do not use local copy optimization.
+        Stream that can be copied fast should specialize this, like StreamTraits<StringStream>.
+    */
+    enum { copyOptimization = 0 };
+};
+
+//! Reserve n characters for writing to a stream.
+template<typename Stream>
+inline void PutReserve(Stream& stream, size_t count) {
+    (void)stream;
+    (void)count;
+}
+
+//! Write character to a stream, presuming buffer is reserved.
+template<typename Stream>
+inline void PutUnsafe(Stream& stream, typename Stream::Ch c) {
+    stream.Put(c);
+}
+
+//! Put N copies of a character to a stream.
+template<typename Stream, typename Ch>
+inline void PutN(Stream& stream, Ch c, size_t n) {
+    PutReserve(stream, n);
+    for (size_t i = 0; i < n; i++)
+        PutUnsafe(stream, c);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericStreamWrapper
+
+//! A Stream Wrapper
+/*! \tThis string stream is a wrapper for any stream by just forwarding any
+    \treceived message to the origin stream.
+    \note implements Stream concept
+*/
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4702)  // unreachable code
+RAPIDJSON_DIAG_OFF(4512)  // assignment operator could not be generated
+#endif
+
+template <typename InputStream, typename Encoding = UTF8<> >
+class GenericStreamWrapper {
+public:
+    typedef typename Encoding::Ch Ch;
+    GenericStreamWrapper(InputStream& is): is_(is) {}
+
+    Ch Peek() const { return is_.Peek(); }
+    Ch Take() { return is_.Take(); }
+    size_t Tell() { return is_.Tell(); }
+    Ch* PutBegin() { return is_.PutBegin(); }
+    void Put(Ch ch) { is_.Put(ch); }
+    void Flush() { is_.Flush(); }
+    size_t PutEnd(Ch* ch) { return is_.PutEnd(ch); }
+
+    // wrapper for MemoryStream
+    const Ch* Peek4() const { return is_.Peek4(); }
+
+    // wrapper for AutoUTFInputStream
+    UTFType GetType() const { return is_.GetType(); }
+    bool HasBOM() const { return is_.HasBOM(); }
+
+protected:
+    InputStream& is_;
+};
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+RAPIDJSON_DIAG_POP
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// StringStream
+
+//! Read-only string stream.
+/*! \note implements Stream concept
+*/
+template <typename Encoding>
+struct GenericStringStream {
+    typedef typename Encoding::Ch Ch;
+
+    GenericStringStream(const Ch *src) : src_(src), head_(src) {}
+
+    Ch Peek() const { return *src_; }
+    Ch Take() { return *src_++; }
+    size_t Tell() const { return static_cast<size_t>(src_ - head_); }
+
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    const Ch* src_;     //!< Current read position.
+    const Ch* head_;    //!< Original head of the string.
+};
+
+template <typename Encoding>
+struct StreamTraits<GenericStringStream<Encoding> > {
+    enum { copyOptimization = 1 };
+};
+
+//! String stream with UTF8 encoding.
+typedef GenericStringStream<UTF8<> > StringStream;
+
+///////////////////////////////////////////////////////////////////////////////
+// InsituStringStream
+
+//! A read-write string stream.
+/*! This string stream is particularly designed for in-situ parsing.
+    \note implements Stream concept
+*/
+template <typename Encoding>
+struct GenericInsituStringStream {
+    typedef typename Encoding::Ch Ch;
+
+    GenericInsituStringStream(Ch *src) : src_(src), dst_(0), head_(src) {}
+
+    // Read
+    Ch Peek() { return *src_; }
+    Ch Take() { return *src_++; }
+    size_t Tell() { return static_cast<size_t>(src_ - head_); }
+
+    // Write
+    void Put(Ch c) { RAPIDJSON_ASSERT(dst_ != 0); *dst_++ = c; }
+
+    Ch* PutBegin() { return dst_ = src_; }
+    size_t PutEnd(Ch* begin) { return static_cast<size_t>(dst_ - begin); }
+    void Flush() {}
+
+    Ch* Push(size_t count) { Ch* begin = dst_; dst_ += count; return begin; }
+    void Pop(size_t count) { dst_ -= count; }
+
+    Ch* src_;
+    Ch* dst_;
+    Ch* head_;
+};
+
+template <typename Encoding>
+struct StreamTraits<GenericInsituStringStream<Encoding> > {
+    enum { copyOptimization = 1 };
+};
+
+//! Insitu string stream with UTF8 encoding.
+typedef GenericInsituStringStream<UTF8<> > InsituStringStream;
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_STREAM_H_
diff --git a/include/rapidjson/stringbuffer.h b/include/rapidjson/stringbuffer.h
new file mode 100644
index 0000000000..82ad3ca6bb
--- /dev/null
+++ b/include/rapidjson/stringbuffer.h
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_STRINGBUFFER_H_
+#define RAPIDJSON_STRINGBUFFER_H_
+
+#include "stream.h"
+#include "internal/stack.h"
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+#include <utility> // std::move
+#endif
+
+#include "internal/stack.h"
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory output stream.
+/*!
+    \tparam Encoding Encoding of the stream.
+    \tparam Allocator type for allocating memory buffer.
+    \note implements Stream concept
+*/
+template <typename Encoding, typename Allocator = CrtAllocator>
+class GenericStringBuffer {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    GenericStringBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {}
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericStringBuffer(GenericStringBuffer&& rhs) : stack_(std::move(rhs.stack_)) {}
+    GenericStringBuffer& operator=(GenericStringBuffer&& rhs) {
+        if (&rhs != this)
+            stack_ = std::move(rhs.stack_);
+        return *this;
+    }
+#endif
+
+    void Put(Ch c) { *stack_.template Push<Ch>() = c; }
+    void PutUnsafe(Ch c) { *stack_.template PushUnsafe<Ch>() = c; }
+    void Flush() {}
+
+    void Clear() { stack_.Clear(); }
+    void ShrinkToFit() {
+        // Push and pop a null terminator. This is safe.
+        *stack_.template Push<Ch>() = '\0';
+        stack_.ShrinkToFit();
+        stack_.template Pop<Ch>(1);
+    }
+
+    void Reserve(size_t count) { stack_.template Reserve<Ch>(count); }
+    Ch* Push(size_t count) { return stack_.template Push<Ch>(count); }
+    Ch* PushUnsafe(size_t count) { return stack_.template PushUnsafe<Ch>(count); }
+    void Pop(size_t count) { stack_.template Pop<Ch>(count); }
+
+    const Ch* GetString() const {
+        // Push and pop a null terminator. This is safe.
+        *stack_.template Push<Ch>() = '\0';
+        stack_.template Pop<Ch>(1);
+
+        return stack_.template Bottom<Ch>();
+    }
+
+    //! Get the size of string in bytes in the string buffer.
+    size_t GetSize() const { return stack_.GetSize(); }
+
+    //! Get the length of string in Ch in the string buffer.
+    size_t GetLength() const { return stack_.GetSize() / sizeof(Ch); }
+
+    static const size_t kDefaultCapacity = 256;
+    mutable internal::Stack<Allocator> stack_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    GenericStringBuffer(const GenericStringBuffer&);
+    GenericStringBuffer& operator=(const GenericStringBuffer&);
+};
+
+//! String buffer with UTF8 encoding
+typedef GenericStringBuffer<UTF8<> > StringBuffer;
+
+template<typename Encoding, typename Allocator>
+inline void PutReserve(GenericStringBuffer<Encoding, Allocator>& stream, size_t count) {
+    stream.Reserve(count);
+}
+
+template<typename Encoding, typename Allocator>
+inline void PutUnsafe(GenericStringBuffer<Encoding, Allocator>& stream, typename Encoding::Ch c) {
+    stream.PutUnsafe(c);
+}
+
+//! Implement specialized version of PutN() with memset() for better performance.
+template<>
+inline void PutN(GenericStringBuffer<UTF8<> >& stream, char c, size_t n) {
+    std::memset(stream.stack_.Push<char>(n), c, n * sizeof(c));
+}
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_STRINGBUFFER_H_
diff --git a/include/rapidjson/uri.h b/include/rapidjson/uri.h
new file mode 100644
index 0000000000..f93e508a4f
--- /dev/null
+++ b/include/rapidjson/uri.h
@@ -0,0 +1,481 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// (C) Copyright IBM Corporation 2021
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_URI_H_
+#define RAPIDJSON_URI_H_
+
+#include "internal/strfunc.h"
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericUri
+
+template <typename ValueType, typename Allocator=CrtAllocator>
+class GenericUri {
+public:
+    typedef typename ValueType::Ch Ch;
+#if RAPIDJSON_HAS_STDSTRING
+    typedef std::basic_string<Ch> String;
+#endif
+
+    //! Constructors
+    GenericUri(Allocator* allocator = 0) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+    }
+
+    GenericUri(const Ch* uri, SizeType len, Allocator* allocator = 0) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+        Parse(uri, len);
+    }
+
+    GenericUri(const Ch* uri, Allocator* allocator = 0) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+        Parse(uri, internal::StrLen<Ch>(uri));
+    }
+
+    // Use with specializations of GenericValue
+    template<typename T> GenericUri(const T& uri, Allocator* allocator = 0) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+        const Ch* u = uri.template Get<const Ch*>(); // TypeHelper from document.h
+        Parse(u, internal::StrLen<Ch>(u));
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    GenericUri(const String& uri, Allocator* allocator = 0) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+        Parse(uri.c_str(), internal::StrLen<Ch>(uri.c_str()));
+    }
+#endif
+
+    //! Copy constructor
+    GenericUri(const GenericUri& rhs) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(), ownAllocator_() {
+        *this = rhs;
+    }
+
+    //! Copy constructor
+    GenericUri(const GenericUri& rhs, Allocator* allocator) : uri_(), base_(), scheme_(), auth_(), path_(), query_(), frag_(), allocator_(allocator), ownAllocator_() {
+        *this = rhs;
+    }
+
+    //! Destructor.
+    ~GenericUri() {
+        Free();
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    //! Assignment operator
+    GenericUri& operator=(const GenericUri& rhs) {
+        if (this != &rhs) {
+            // Do not delete ownAllocator
+            Free();
+            Allocate(rhs.GetStringLength());
+            auth_ = CopyPart(scheme_, rhs.scheme_, rhs.GetSchemeStringLength());
+            path_ = CopyPart(auth_, rhs.auth_, rhs.GetAuthStringLength());
+            query_ = CopyPart(path_, rhs.path_, rhs.GetPathStringLength());
+            frag_ = CopyPart(query_, rhs.query_, rhs.GetQueryStringLength());
+            base_ = CopyPart(frag_, rhs.frag_, rhs.GetFragStringLength());
+            uri_ = CopyPart(base_, rhs.base_, rhs.GetBaseStringLength());
+            CopyPart(uri_, rhs.uri_, rhs.GetStringLength());
+        }
+        return *this;
+    }
+
+    //! Getters
+    // Use with specializations of GenericValue
+    template<typename T> void Get(T& uri, Allocator& allocator) {
+        uri.template Set<const Ch*>(this->GetString(), allocator); // TypeHelper from document.h
+    }
+
+    const Ch* GetString() const { return uri_; }
+    SizeType GetStringLength() const { return uri_ == 0 ? 0 : internal::StrLen<Ch>(uri_); }
+    const Ch* GetBaseString() const { return base_; }
+    SizeType GetBaseStringLength() const { return base_ == 0 ? 0 : internal::StrLen<Ch>(base_); }
+    const Ch* GetSchemeString() const { return scheme_; }
+    SizeType GetSchemeStringLength() const { return scheme_ == 0 ? 0 : internal::StrLen<Ch>(scheme_); }
+    const Ch* GetAuthString() const { return auth_; }
+    SizeType GetAuthStringLength() const { return auth_ == 0 ? 0 : internal::StrLen<Ch>(auth_); }
+    const Ch* GetPathString() const { return path_; }
+    SizeType GetPathStringLength() const { return path_ == 0 ? 0 : internal::StrLen<Ch>(path_); }
+    const Ch* GetQueryString() const { return query_; }
+    SizeType GetQueryStringLength() const { return query_ == 0 ? 0 : internal::StrLen<Ch>(query_); }
+    const Ch* GetFragString() const { return frag_; }
+    SizeType GetFragStringLength() const { return frag_ == 0 ? 0 : internal::StrLen<Ch>(frag_); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    static String Get(const GenericUri& uri) { return String(uri.GetString(), uri.GetStringLength()); }
+    static String GetBase(const GenericUri& uri) { return String(uri.GetBaseString(), uri.GetBaseStringLength()); }
+    static String GetScheme(const GenericUri& uri) { return String(uri.GetSchemeString(), uri.GetSchemeStringLength()); }
+    static String GetAuth(const GenericUri& uri) { return String(uri.GetAuthString(), uri.GetAuthStringLength()); }
+    static String GetPath(const GenericUri& uri) { return String(uri.GetPathString(), uri.GetPathStringLength()); }
+    static String GetQuery(const GenericUri& uri) { return String(uri.GetQueryString(), uri.GetQueryStringLength()); }
+    static String GetFrag(const GenericUri& uri) { return String(uri.GetFragString(), uri.GetFragStringLength()); }
+#endif
+
+    //! Equality operators
+    bool operator==(const GenericUri& rhs) const {
+        return Match(rhs, true);
+    }
+
+    bool operator!=(const GenericUri& rhs) const {
+        return !Match(rhs, true);
+    }
+
+    bool Match(const GenericUri& uri, bool full = true) const {
+        Ch* s1;
+        Ch* s2;
+        if (full) {
+            s1 = uri_;
+            s2 = uri.uri_;
+        } else {
+            s1 = base_;
+            s2 = uri.base_;
+        }
+        if (s1 == s2) return true;
+        if (s1 == 0 || s2 == 0) return false;
+        return internal::StrCmp<Ch>(s1, s2) == 0;
+    }
+
+    //! Resolve this URI against another (base) URI in accordance with URI resolution rules.
+    // See https://tools.ietf.org/html/rfc3986
+    // Use for resolving an id or $ref with an in-scope id.
+    // Returns a new GenericUri for the resolved URI.
+    GenericUri Resolve(const GenericUri& baseuri, Allocator* allocator = 0) {
+        GenericUri resuri;
+        resuri.allocator_ = allocator;
+        // Ensure enough space for combining paths
+        resuri.Allocate(GetStringLength() + baseuri.GetStringLength() + 1); // + 1 for joining slash
+
+        if (!(GetSchemeStringLength() == 0)) {
+            // Use all of this URI
+            resuri.auth_ = CopyPart(resuri.scheme_, scheme_, GetSchemeStringLength());
+            resuri.path_ = CopyPart(resuri.auth_, auth_, GetAuthStringLength());
+            resuri.query_ = CopyPart(resuri.path_, path_, GetPathStringLength());
+            resuri.frag_ = CopyPart(resuri.query_, query_, GetQueryStringLength());
+            resuri.RemoveDotSegments();
+        } else {
+            // Use the base scheme
+            resuri.auth_ = CopyPart(resuri.scheme_, baseuri.scheme_, baseuri.GetSchemeStringLength());
+            if (!(GetAuthStringLength() == 0)) {
+                // Use this auth, path, query
+                resuri.path_ = CopyPart(resuri.auth_, auth_, GetAuthStringLength());
+                resuri.query_ = CopyPart(resuri.path_, path_, GetPathStringLength());
+                resuri.frag_ = CopyPart(resuri.query_, query_, GetQueryStringLength());
+                resuri.RemoveDotSegments();
+            } else {
+                // Use the base auth
+                resuri.path_ = CopyPart(resuri.auth_, baseuri.auth_, baseuri.GetAuthStringLength());
+                if (GetPathStringLength() == 0) {
+                    // Use the base path
+                    resuri.query_ = CopyPart(resuri.path_, baseuri.path_, baseuri.GetPathStringLength());
+                    if (GetQueryStringLength() == 0) {
+                        // Use the base query
+                        resuri.frag_ = CopyPart(resuri.query_, baseuri.query_, baseuri.GetQueryStringLength());
+                    } else {
+                        // Use this query
+                        resuri.frag_ = CopyPart(resuri.query_, query_, GetQueryStringLength());
+                    }
+                } else {
+                    if (path_[0] == '/') {
+                        // Absolute path - use all of this path
+                        resuri.query_ = CopyPart(resuri.path_, path_, GetPathStringLength());
+                        resuri.RemoveDotSegments();
+                    } else {
+                        // Relative path - append this path to base path after base path's last slash
+                        size_t pos = 0;
+                        if (!(baseuri.GetAuthStringLength() == 0) && baseuri.GetPathStringLength() == 0) {
+                            resuri.path_[pos] = '/';
+                            pos++;
+                        }
+                        size_t lastslashpos = baseuri.GetPathStringLength();
+                        while (lastslashpos > 0) {
+                            if (baseuri.path_[lastslashpos - 1] == '/') break;
+                            lastslashpos--;
+                        }
+                        std::memcpy(&resuri.path_[pos], baseuri.path_, lastslashpos * sizeof(Ch));
+                        pos += lastslashpos;
+                        resuri.query_ = CopyPart(&resuri.path_[pos], path_, GetPathStringLength());
+                        resuri.RemoveDotSegments();
+                    }
+                    // Use this query
+                    resuri.frag_ = CopyPart(resuri.query_, query_, GetQueryStringLength());
+                }
+            }
+        }
+        // Always use this frag
+        resuri.base_ = CopyPart(resuri.frag_, frag_, GetFragStringLength());
+
+        // Re-constitute base_ and uri_
+        resuri.SetBase();
+        resuri.uri_ = resuri.base_ + resuri.GetBaseStringLength() + 1;
+        resuri.SetUri();
+        return resuri;
+    }
+
+    //! Get the allocator of this GenericUri.
+    Allocator& GetAllocator() { return *allocator_; }
+
+private:
+    // Allocate memory for a URI
+    // Returns total amount allocated
+    std::size_t Allocate(std::size_t len) {
+        // Create own allocator if user did not supply.
+        if (!allocator_)
+            ownAllocator_ =  allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        // Allocate one block containing each part of the URI (5) plus base plus full URI, all null terminated.
+        // Order: scheme, auth, path, query, frag, base, uri
+        // Note need to set, increment, assign in 3 stages to avoid compiler warning bug.
+        size_t total = (3 * len + 7) * sizeof(Ch);
+        scheme_ = static_cast<Ch*>(allocator_->Malloc(total));
+        *scheme_ = '\0';
+        auth_ = scheme_;
+        auth_++;
+        *auth_ = '\0';
+        path_ = auth_;
+        path_++;
+        *path_ = '\0';
+        query_ = path_;
+        query_++;
+        *query_ = '\0';
+        frag_ = query_;
+        frag_++;
+        *frag_ = '\0';
+        base_ = frag_;
+        base_++;
+        *base_ = '\0';
+        uri_ = base_;
+        uri_++;
+        *uri_ = '\0';
+        return total;
+    }
+
+    // Free memory for a URI
+    void Free() {
+        if (scheme_) {
+            Allocator::Free(scheme_);
+            scheme_ = 0;
+        }
+    }
+
+    // Parse a URI into constituent scheme, authority, path, query, & fragment parts
+    // Supports URIs that match regex ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? as per
+    // https://tools.ietf.org/html/rfc3986
+    void Parse(const Ch* uri, std::size_t len) {
+        std::size_t start = 0, pos1 = 0, pos2 = 0;
+        Allocate(len);
+
+        // Look for scheme ([^:/?#]+):)?
+        if (start < len) {
+            while (pos1 < len) {
+                if (uri[pos1] == ':') break;
+                pos1++;
+            }
+            if (pos1 != len) {
+                while (pos2 < len) {
+                    if (uri[pos2] == '/') break;
+                    if (uri[pos2] == '?') break;
+                    if (uri[pos2] == '#') break;
+                    pos2++;
+                }
+                if (pos1 < pos2) {
+                    pos1++;
+                    std::memcpy(scheme_, &uri[start], pos1 * sizeof(Ch));
+                    scheme_[pos1] = '\0';
+                    start = pos1;
+                }
+            }
+        }
+        // Look for auth (//([^/?#]*))?
+        // Note need to set, increment, assign in 3 stages to avoid compiler warning bug.
+        auth_ = scheme_ + GetSchemeStringLength();
+        auth_++;
+        *auth_ = '\0';
+        if (start < len - 1 && uri[start] == '/' && uri[start + 1] == '/') {
+            pos2 = start + 2;
+            while (pos2 < len) {
+                if (uri[pos2] == '/') break;
+                if (uri[pos2] == '?') break;
+                if (uri[pos2] == '#') break;
+                pos2++;
+            }
+            std::memcpy(auth_, &uri[start], (pos2 - start) * sizeof(Ch));
+            auth_[pos2 - start] = '\0';
+            start = pos2;
+        }
+        // Look for path ([^?#]*)
+        // Note need to set, increment, assign in 3 stages to avoid compiler warning bug.
+        path_ = auth_ + GetAuthStringLength();
+        path_++;
+        *path_ = '\0';
+        if (start < len) {
+            pos2 = start;
+            while (pos2 < len) {
+                if (uri[pos2] == '?') break;
+                if (uri[pos2] == '#') break;
+                pos2++;
+            }
+            if (start != pos2) {
+                std::memcpy(path_, &uri[start], (pos2 - start) * sizeof(Ch));
+                path_[pos2 - start] = '\0';
+                if (path_[0] == '/')
+                    RemoveDotSegments();   // absolute path - normalize
+                start = pos2;
+            }
+        }
+        // Look for query (\?([^#]*))?
+        // Note need to set, increment, assign in 3 stages to avoid compiler warning bug.
+        query_ = path_ + GetPathStringLength();
+        query_++;
+        *query_ = '\0';
+        if (start < len && uri[start] == '?') {
+            pos2 = start + 1;
+            while (pos2 < len) {
+                if (uri[pos2] == '#') break;
+                pos2++;
+            }
+            if (start != pos2) {
+                std::memcpy(query_, &uri[start], (pos2 - start) * sizeof(Ch));
+                query_[pos2 - start] = '\0';
+                start = pos2;
+            }
+        }
+        // Look for fragment (#(.*))?
+        // Note need to set, increment, assign in 3 stages to avoid compiler warning bug.
+        frag_ = query_ + GetQueryStringLength();
+        frag_++;
+        *frag_ = '\0';
+        if (start < len && uri[start] == '#') {
+            std::memcpy(frag_, &uri[start], (len - start) * sizeof(Ch));
+            frag_[len - start] = '\0';
+        }
+
+        // Re-constitute base_ and uri_
+        base_ = frag_ + GetFragStringLength() + 1;
+        SetBase();
+        uri_ = base_ + GetBaseStringLength() + 1;
+        SetUri();
+    }
+
+    // Reconstitute base
+    void SetBase() {
+        Ch* next = base_;
+        std::memcpy(next, scheme_, GetSchemeStringLength() * sizeof(Ch));
+        next+= GetSchemeStringLength();
+        std::memcpy(next, auth_, GetAuthStringLength() * sizeof(Ch));
+        next+= GetAuthStringLength();
+        std::memcpy(next, path_, GetPathStringLength() * sizeof(Ch));
+        next+= GetPathStringLength();
+        std::memcpy(next, query_, GetQueryStringLength() * sizeof(Ch));
+        next+= GetQueryStringLength();
+        *next = '\0';
+    }
+
+    // Reconstitute uri
+    void SetUri() {
+        Ch* next = uri_;
+        std::memcpy(next, base_, GetBaseStringLength() * sizeof(Ch));
+        next+= GetBaseStringLength();
+        std::memcpy(next, frag_, GetFragStringLength() * sizeof(Ch));
+        next+= GetFragStringLength();
+        *next = '\0';
+    }
+
+    // Copy a part from one GenericUri to another
+    // Return the pointer to the next part to be copied to
+    Ch* CopyPart(Ch* to, Ch* from, std::size_t len) {
+        RAPIDJSON_ASSERT(to != 0);
+        RAPIDJSON_ASSERT(from != 0);
+        std::memcpy(to, from, len * sizeof(Ch));
+        to[len] = '\0';
+        Ch* next = to + len + 1;
+        return next;
+    }
+
+    // Remove . and .. segments from the path_ member.
+    // https://tools.ietf.org/html/rfc3986
+    // This is done in place as we are only removing segments.
+    void RemoveDotSegments() {
+        std::size_t pathlen = GetPathStringLength();
+        std::size_t pathpos = 0;  // Position in path_
+        std::size_t newpos = 0;   // Position in new path_
+
+        // Loop through each segment in original path_
+        while (pathpos < pathlen) {
+            // Get next segment, bounded by '/' or end
+            size_t slashpos = 0;
+            while ((pathpos + slashpos) < pathlen) {
+                if (path_[pathpos + slashpos] == '/') break;
+                slashpos++;
+            }
+            // Check for .. and . segments
+            if (slashpos == 2 && path_[pathpos] == '.' && path_[pathpos + 1] == '.') {
+                // Backup a .. segment in the new path_
+                // We expect to find a previously added slash at the end or nothing
+                RAPIDJSON_ASSERT(newpos == 0 || path_[newpos - 1] == '/');
+                size_t lastslashpos = newpos;
+                // Make sure we don't go beyond the start segment
+                if (lastslashpos > 1) {
+                    // Find the next to last slash and back up to it
+                    lastslashpos--;
+                    while (lastslashpos > 0) {
+                        if (path_[lastslashpos - 1] == '/') break;
+                        lastslashpos--;
+                    }
+                    // Set the new path_ position
+                    newpos = lastslashpos;
+                }
+            } else if (slashpos == 1 && path_[pathpos] == '.') {
+                // Discard . segment, leaves new path_ unchanged
+            } else {
+                // Move any other kind of segment to the new path_
+                RAPIDJSON_ASSERT(newpos <= pathpos);
+                std::memmove(&path_[newpos], &path_[pathpos], slashpos * sizeof(Ch));
+                newpos += slashpos;
+                // Add slash if not at end
+                if ((pathpos + slashpos) < pathlen) {
+                    path_[newpos] = '/';
+                    newpos++;
+                }
+            }
+            // Move to next segment
+            pathpos += slashpos + 1;
+        }
+        path_[newpos] = '\0';
+    }
+
+    Ch* uri_;    // Everything
+    Ch* base_;   // Everything except fragment
+    Ch* scheme_; // Includes the :
+    Ch* auth_;   // Includes the //
+    Ch* path_;   // Absolute if starts with /
+    Ch* query_;  // Includes the ?
+    Ch* frag_;   // Includes the #
+
+    Allocator* allocator_;      //!< The current allocator. It is either user-supplied or equal to ownAllocator_.
+    Allocator* ownAllocator_;   //!< Allocator owned by this Uri.
+};
+
+//! GenericUri for Value (UTF-8, default allocator).
+typedef GenericUri<Value> Uri;
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_URI_H_
diff --git a/include/rapidjson/writer.h b/include/rapidjson/writer.h
new file mode 100644
index 0000000000..632e02ce74
--- /dev/null
+++ b/include/rapidjson/writer.h
@@ -0,0 +1,721 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+// 
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed 
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_WRITER_H_
+#define RAPIDJSON_WRITER_H_
+
+#include "stream.h"
+#include "internal/clzll.h"
+#include "internal/meta.h"
+#include "internal/stack.h"
+#include "internal/strfunc.h"
+#include "internal/dtoa.h"
+#include "internal/itoa.h"
+#include "stringbuffer.h"
+#include <new>      // placement new
+
+#if defined(RAPIDJSON_SIMD) && defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#endif
+#ifdef RAPIDJSON_SSE42
+#include <nmmintrin.h>
+#elif defined(RAPIDJSON_SSE2)
+#include <emmintrin.h>
+#elif defined(RAPIDJSON_NEON)
+#include <arm_neon.h>
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(unreachable-code)
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// WriteFlag
+
+/*! \def RAPIDJSON_WRITE_DEFAULT_FLAGS 
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-defined kWriteDefaultFlags definition.
+
+    User can define this as any \c WriteFlag combinations.
+*/
+#ifndef RAPIDJSON_WRITE_DEFAULT_FLAGS
+#define RAPIDJSON_WRITE_DEFAULT_FLAGS kWriteNoFlags
+#endif
+
+//! Combination of writeFlags
+enum WriteFlag {
+    kWriteNoFlags = 0,              //!< No flags are set.
+    kWriteValidateEncodingFlag = 1, //!< Validate encoding of JSON strings.
+    kWriteNanAndInfFlag = 2,        //!< Allow writing of Infinity, -Infinity and NaN.
+    kWriteNanAndInfNullFlag = 4,    //!< Allow writing of Infinity, -Infinity and NaN as null.
+    kWriteDefaultFlags = RAPIDJSON_WRITE_DEFAULT_FLAGS  //!< Default write flags. Can be customized by defining RAPIDJSON_WRITE_DEFAULT_FLAGS
+};
+
+//! JSON writer
+/*! Writer implements the concept Handler.
+    It generates JSON text by events to an output os.
+
+    User may programmatically calls the functions of a writer to generate JSON text.
+
+    On the other side, a writer can also be passed to objects that generates events, 
+
+    for example Reader::Parse() and Document::Accept().
+
+    \tparam OutputStream Type of output stream.
+    \tparam SourceEncoding Encoding of source string.
+    \tparam TargetEncoding Encoding of output stream.
+    \tparam StackAllocator Type of allocator for allocating memory of stack.
+    \note implements Handler concept
+*/
+template<typename OutputStream, typename SourceEncoding = UTF8<>, typename TargetEncoding = UTF8<>, typename StackAllocator = CrtAllocator, unsigned writeFlags = kWriteDefaultFlags>
+class Writer {
+public:
+    typedef typename SourceEncoding::Ch Ch;
+
+    static const int kDefaultMaxDecimalPlaces = 324;
+
+    //! Constructor
+    /*! \param os Output stream.
+        \param stackAllocator User supplied allocator. If it is null, it will create a private one.
+        \param levelDepth Initial capacity of stack.
+    */
+    explicit
+    Writer(OutputStream& os, StackAllocator* stackAllocator = 0, size_t levelDepth = kDefaultLevelDepth) : 
+        os_(&os), level_stack_(stackAllocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {}
+
+    explicit
+    Writer(StackAllocator* allocator = 0, size_t levelDepth = kDefaultLevelDepth) :
+        os_(0), level_stack_(allocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {}
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    Writer(Writer&& rhs) :
+        os_(rhs.os_), level_stack_(std::move(rhs.level_stack_)), maxDecimalPlaces_(rhs.maxDecimalPlaces_), hasRoot_(rhs.hasRoot_) {
+        rhs.os_ = 0;
+    }
+#endif
+
+    //! Reset the writer with a new stream.
+    /*!
+        This function reset the writer with a new stream and default settings,
+        in order to make a Writer object reusable for output multiple JSONs.
+
+        \param os New output stream.
+        \code
+        Writer<OutputStream> writer(os1);
+        writer.StartObject();
+        // ...
+        writer.EndObject();
+
+        writer.Reset(os2);
+        writer.StartObject();
+        // ...
+        writer.EndObject();
+        \endcode
+    */
+    void Reset(OutputStream& os) {
+        os_ = &os;
+        hasRoot_ = false;
+        level_stack_.Clear();
+    }
+
+    //! Checks whether the output is a complete JSON.
+    /*!
+        A complete JSON has a complete root object or array.
+    */
+    bool IsComplete() const {
+        return hasRoot_ && level_stack_.Empty();
+    }
+
+    int GetMaxDecimalPlaces() const {
+        return maxDecimalPlaces_;
+    }
+
+    //! Sets the maximum number of decimal places for double output.
+    /*!
+        This setting truncates the output with specified number of decimal places.
+
+        For example, 
+
+        \code
+        writer.SetMaxDecimalPlaces(3);
+        writer.StartArray();
+        writer.Double(0.12345);                 // "0.123"
+        writer.Double(0.0001);                  // "0.0"
+        writer.Double(1.234567890123456e30);    // "1.234567890123456e30" (do not truncate significand for positive exponent)
+        writer.Double(1.23e-4);                 // "0.0"                  (do truncate significand for negative exponent)
+        writer.EndArray();
+        \endcode
+
+        The default setting does not truncate any decimal places. You can restore to this setting by calling
+        \code
+        writer.SetMaxDecimalPlaces(Writer::kDefaultMaxDecimalPlaces);
+        \endcode
+    */
+    void SetMaxDecimalPlaces(int maxDecimalPlaces) {
+        maxDecimalPlaces_ = maxDecimalPlaces;
+    }
+
+    /*!@name Implementation of Handler
+        \see Handler
+    */
+    //@{
+
+    bool Null()                 { Prefix(kNullType);   return EndValue(WriteNull()); }
+    bool Bool(bool b)           { Prefix(b ? kTrueType : kFalseType); return EndValue(WriteBool(b)); }
+    bool Int(int i)             { Prefix(kNumberType); return EndValue(WriteInt(i)); }
+    bool Uint(unsigned u)       { Prefix(kNumberType); return EndValue(WriteUint(u)); }
+    bool Int64(int64_t i64)     { Prefix(kNumberType); return EndValue(WriteInt64(i64)); }
+    bool Uint64(uint64_t u64)   { Prefix(kNumberType); return EndValue(WriteUint64(u64)); }
+
+    //! Writes the given \c double value to the stream
+    /*!
+        \param d The value to be written.
+        \return Whether it is succeed.
+    */
+    bool Double(double d)       { Prefix(kNumberType); return EndValue(WriteDouble(d)); }
+
+    bool RawNumber(const Ch* str, SizeType length, bool copy = false) {
+        RAPIDJSON_ASSERT(str != 0);
+        (void)copy;
+        Prefix(kNumberType);
+        return EndValue(WriteString(str, length));
+    }
+
+    bool String(const Ch* str, SizeType length, bool copy = false) {
+        RAPIDJSON_ASSERT(str != 0);
+        (void)copy;
+        Prefix(kStringType);
+        return EndValue(WriteString(str, length));
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool String(const std::basic_string<Ch>& str) {
+        return String(str.data(), SizeType(str.size()));
+    }
+#endif
+
+    bool StartObject() {
+        Prefix(kObjectType);
+        new (level_stack_.template Push<Level>()) Level(false);
+        return WriteStartObject();
+    }
+
+    bool Key(const Ch* str, SizeType length, bool copy = false) { return String(str, length, copy); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool Key(const std::basic_string<Ch>& str)
+    {
+      return Key(str.data(), SizeType(str.size()));
+    }
+#endif
+
+    bool EndObject(SizeType memberCount = 0) {
+        (void)memberCount;
+        RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); // not inside an Object
+        RAPIDJSON_ASSERT(!level_stack_.template Top<Level>()->inArray); // currently inside an Array, not Object
+        RAPIDJSON_ASSERT(0 == level_stack_.template Top<Level>()->valueCount % 2); // Object has a Key without a Value
+        level_stack_.template Pop<Level>(1);
+        return EndValue(WriteEndObject());
+    }
+
+    bool StartArray() {
+        Prefix(kArrayType);
+        new (level_stack_.template Push<Level>()) Level(true);
+        return WriteStartArray();
+    }
+
+    bool EndArray(SizeType elementCount = 0) {
+        (void)elementCount;
+        RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level));
+        RAPIDJSON_ASSERT(level_stack_.template Top<Level>()->inArray);
+        level_stack_.template Pop<Level>(1);
+        return EndValue(WriteEndArray());
+    }
+    //@}
+
+    /*! @name Convenience extensions */
+    //@{
+
+    //! Simpler but slower overload.
+    bool String(const Ch* const& str) { return String(str, internal::StrLen(str)); }
+    bool Key(const Ch* const& str) { return Key(str, internal::StrLen(str)); }
+    
+    //@}
+
+    //! Write a raw JSON value.
+    /*!
+        For user to write a stringified JSON as a value.
+
+        \param json A well-formed JSON value. It should not contain null character within [0, length - 1] range.
+        \param length Length of the json.
+        \param type Type of the root of json.
+    */
+    bool RawValue(const Ch* json, size_t length, Type type) {
+        RAPIDJSON_ASSERT(json != 0);
+        Prefix(type);
+        return EndValue(WriteRawValue(json, length));
+    }
+
+    //! Flush the output stream.
+    /*!
+        Allows the user to flush the output stream immediately.
+     */
+    void Flush() {
+        os_->Flush();
+    }
+
+    static const size_t kDefaultLevelDepth = 32;
+
+protected:
+    //! Information for each nested level
+    struct Level {
+        Level(bool inArray_) : valueCount(0), inArray(inArray_) {}
+        size_t valueCount;  //!< number of values in this level
+        bool inArray;       //!< true if in array, otherwise in object
+    };
+
+    bool WriteNull()  {
+        PutReserve(*os_, 4);
+        PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 'l'); return true;
+    }
+
+    bool WriteBool(bool b)  {
+        if (b) {
+            PutReserve(*os_, 4);
+            PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'r'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'e');
+        }
+        else {
+            PutReserve(*os_, 5);
+            PutUnsafe(*os_, 'f'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 's'); PutUnsafe(*os_, 'e');
+        }
+        return true;
+    }
+
+    bool WriteInt(int i) {
+        char buffer[11];
+        const char* end = internal::i32toa(i, buffer);
+        PutReserve(*os_, static_cast<size_t>(end - buffer));
+        for (const char* p = buffer; p != end; ++p)
+            PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p));
+        return true;
+    }
+
+    bool WriteUint(unsigned u) {
+        char buffer[10];
+        const char* end = internal::u32toa(u, buffer);
+        PutReserve(*os_, static_cast<size_t>(end - buffer));
+        for (const char* p = buffer; p != end; ++p)
+            PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p));
+        return true;
+    }
+
+    bool WriteInt64(int64_t i64) {
+        char buffer[21];
+        const char* end = internal::i64toa(i64, buffer);
+        PutReserve(*os_, static_cast<size_t>(end - buffer));
+        for (const char* p = buffer; p != end; ++p)
+            PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p));
+        return true;
+    }
+
+    bool WriteUint64(uint64_t u64) {
+        char buffer[20];
+        char* end = internal::u64toa(u64, buffer);
+        PutReserve(*os_, static_cast<size_t>(end - buffer));
+        for (char* p = buffer; p != end; ++p)
+            PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p));
+        return true;
+    }
+
+    bool WriteDouble(double d) {
+        if (internal::Double(d).IsNanOrInf()) {
+            if (!(writeFlags & kWriteNanAndInfFlag) && !(writeFlags & kWriteNanAndInfNullFlag))
+                return false;
+            if (writeFlags & kWriteNanAndInfNullFlag) {
+                PutReserve(*os_, 4);
+                PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 'l');
+                return true;
+            }
+            if (internal::Double(d).IsNan()) {
+                PutReserve(*os_, 3);
+                PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N');
+                return true;
+            }
+            if (internal::Double(d).Sign()) {
+                PutReserve(*os_, 9);
+                PutUnsafe(*os_, '-');
+            }
+            else
+                PutReserve(*os_, 8);
+            PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f');
+            PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y');
+            return true;
+        }
+
+        char buffer[25];
+        char* end = internal::dtoa(d, buffer, maxDecimalPlaces_);
+        PutReserve(*os_, static_cast<size_t>(end - buffer));
+        for (char* p = buffer; p != end; ++p)
+            PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p));
+        return true;
+    }
+
+    bool WriteString(const Ch* str, SizeType length)  {
+        static const typename OutputStream::Ch hexDigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+        static const char escape[256] = {
+#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+            //0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
+            'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'b', 't', 'n', 'u', 'f', 'r', 'u', 'u', // 00
+            'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', // 10
+              0,   0, '"',   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 20
+            Z16, Z16,                                                                       // 30~4F
+              0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,'\\',   0,   0,   0, // 50
+            Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16                                // 60~FF
+#undef Z16
+        };
+
+        if (TargetEncoding::supportUnicode)
+            PutReserve(*os_, 2 + length * 6); // "\uxxxx..."
+        else
+            PutReserve(*os_, 2 + length * 12);  // "\uxxxx\uyyyy..."
+
+        PutUnsafe(*os_, '\"');
+        GenericStringStream<SourceEncoding> is(str);
+        while (ScanWriteUnescapedString(is, length)) {
+            const Ch c = is.Peek();
+            if (!TargetEncoding::supportUnicode && static_cast<unsigned>(c) >= 0x80) {
+                // Unicode escaping
+                unsigned codepoint;
+                if (RAPIDJSON_UNLIKELY(!SourceEncoding::Decode(is, &codepoint)))
+                    return false;
+                PutUnsafe(*os_, '\\');
+                PutUnsafe(*os_, 'u');
+                if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)) {
+                    PutUnsafe(*os_, hexDigits[(codepoint >> 12) & 15]);
+                    PutUnsafe(*os_, hexDigits[(codepoint >>  8) & 15]);
+                    PutUnsafe(*os_, hexDigits[(codepoint >>  4) & 15]);
+                    PutUnsafe(*os_, hexDigits[(codepoint      ) & 15]);
+                }
+                else {
+                    RAPIDJSON_ASSERT(codepoint >= 0x010000 && codepoint <= 0x10FFFF);
+                    // Surrogate pair
+                    unsigned s = codepoint - 0x010000;
+                    unsigned lead = (s >> 10) + 0xD800;
+                    unsigned trail = (s & 0x3FF) + 0xDC00;
+                    PutUnsafe(*os_, hexDigits[(lead >> 12) & 15]);
+                    PutUnsafe(*os_, hexDigits[(lead >>  8) & 15]);
+                    PutUnsafe(*os_, hexDigits[(lead >>  4) & 15]);
+                    PutUnsafe(*os_, hexDigits[(lead      ) & 15]);
+                    PutUnsafe(*os_, '\\');
+                    PutUnsafe(*os_, 'u');
+                    PutUnsafe(*os_, hexDigits[(trail >> 12) & 15]);
+                    PutUnsafe(*os_, hexDigits[(trail >>  8) & 15]);
+                    PutUnsafe(*os_, hexDigits[(trail >>  4) & 15]);
+                    PutUnsafe(*os_, hexDigits[(trail      ) & 15]);                    
+                }
+            }
+            else if ((sizeof(Ch) == 1 || static_cast<unsigned>(c) < 256) && RAPIDJSON_UNLIKELY(escape[static_cast<unsigned char>(c)]))  {
+                is.Take();
+                PutUnsafe(*os_, '\\');
+                PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(escape[static_cast<unsigned char>(c)]));
+                if (escape[static_cast<unsigned char>(c)] == 'u') {
+                    PutUnsafe(*os_, '0');
+                    PutUnsafe(*os_, '0');
+                    PutUnsafe(*os_, hexDigits[static_cast<unsigned char>(c) >> 4]);
+                    PutUnsafe(*os_, hexDigits[static_cast<unsigned char>(c) & 0xF]);
+                }
+            }
+            else if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? 
+                Transcoder<SourceEncoding, TargetEncoding>::Validate(is, *os_) :
+                Transcoder<SourceEncoding, TargetEncoding>::TranscodeUnsafe(is, *os_))))
+                return false;
+        }
+        PutUnsafe(*os_, '\"');
+        return true;
+    }
+
+    bool ScanWriteUnescapedString(GenericStringStream<SourceEncoding>& is, size_t length) {
+        return RAPIDJSON_LIKELY(is.Tell() < length);
+    }
+
+    bool WriteStartObject() { os_->Put('{'); return true; }
+    bool WriteEndObject()   { os_->Put('}'); return true; }
+    bool WriteStartArray()  { os_->Put('['); return true; }
+    bool WriteEndArray()    { os_->Put(']'); return true; }
+
+    bool WriteRawValue(const Ch* json, size_t length) {
+        PutReserve(*os_, length);
+        GenericStringStream<SourceEncoding> is(json);
+        while (RAPIDJSON_LIKELY(is.Tell() < length)) {
+            RAPIDJSON_ASSERT(is.Peek() != '\0');
+            if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? 
+                Transcoder<SourceEncoding, TargetEncoding>::Validate(is, *os_) :
+                Transcoder<SourceEncoding, TargetEncoding>::TranscodeUnsafe(is, *os_))))
+                return false;
+        }
+        return true;
+    }
+
+    void Prefix(Type type) {
+        (void)type;
+        if (RAPIDJSON_LIKELY(level_stack_.GetSize() != 0)) { // this value is not at root
+            Level* level = level_stack_.template Top<Level>();
+            if (level->valueCount > 0) {
+                if (level->inArray) 
+                    os_->Put(','); // add comma if it is not the first element in array
+                else  // in object
+                    os_->Put((level->valueCount % 2 == 0) ? ',' : ':');
+            }
+            if (!level->inArray && level->valueCount % 2 == 0)
+                RAPIDJSON_ASSERT(type == kStringType);  // if it's in object, then even number should be a name
+            level->valueCount++;
+        }
+        else {
+            RAPIDJSON_ASSERT(!hasRoot_);    // Should only has one and only one root.
+            hasRoot_ = true;
+        }
+    }
+
+    // Flush the value if it is the top level one.
+    bool EndValue(bool ret) {
+        if (RAPIDJSON_UNLIKELY(level_stack_.Empty()))   // end of json text
+            Flush();
+        return ret;
+    }
+
+    OutputStream* os_;
+    internal::Stack<StackAllocator> level_stack_;
+    int maxDecimalPlaces_;
+    bool hasRoot_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    Writer(const Writer&);
+    Writer& operator=(const Writer&);
+};
+
+// Full specialization for StringStream to prevent memory copying
+
+template<>
+inline bool Writer<StringBuffer>::WriteInt(int i) {
+    char *buffer = os_->Push(11);
+    const char* end = internal::i32toa(i, buffer);
+    os_->Pop(static_cast<size_t>(11 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteUint(unsigned u) {
+    char *buffer = os_->Push(10);
+    const char* end = internal::u32toa(u, buffer);
+    os_->Pop(static_cast<size_t>(10 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteInt64(int64_t i64) {
+    char *buffer = os_->Push(21);
+    const char* end = internal::i64toa(i64, buffer);
+    os_->Pop(static_cast<size_t>(21 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteUint64(uint64_t u) {
+    char *buffer = os_->Push(20);
+    const char* end = internal::u64toa(u, buffer);
+    os_->Pop(static_cast<size_t>(20 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteDouble(double d) {
+    if (internal::Double(d).IsNanOrInf()) {
+        // Note: This code path can only be reached if (RAPIDJSON_WRITE_DEFAULT_FLAGS & kWriteNanAndInfFlag).
+        if (!(kWriteDefaultFlags & kWriteNanAndInfFlag))
+            return false;
+        if (kWriteDefaultFlags & kWriteNanAndInfNullFlag) {
+            PutReserve(*os_, 4);
+            PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 'l');
+            return true;
+        }
+        if (internal::Double(d).IsNan()) {
+            PutReserve(*os_, 3);
+            PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N');
+            return true;
+        }
+        if (internal::Double(d).Sign()) {
+            PutReserve(*os_, 9);
+            PutUnsafe(*os_, '-');
+        }
+        else
+            PutReserve(*os_, 8);
+        PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f');
+        PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y');
+        return true;
+    }
+    
+    char *buffer = os_->Push(25);
+    char* end = internal::dtoa(d, buffer, maxDecimalPlaces_);
+    os_->Pop(static_cast<size_t>(25 - (end - buffer)));
+    return true;
+}
+
+#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42)
+template<>
+inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) {
+    if (length < 16)
+        return RAPIDJSON_LIKELY(is.Tell() < length);
+
+    if (!RAPIDJSON_LIKELY(is.Tell() < length))
+        return false;
+
+    const char* p = is.src_;
+    const char* end = is.head_ + length;
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15));
+    if (nextAligned > end)
+        return true;
+
+    while (p != nextAligned)
+        if (*p < 0x20 || *p == '\"' || *p == '\\') {
+            is.src_ = p;
+            return RAPIDJSON_LIKELY(is.Tell() < length);
+        }
+        else
+            os_->PutUnsafe(*p++);
+
+    // The rest of string using SIMD
+    static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' };
+    static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
+    static const char space[16]  = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F };
+    const __m128i dq = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0]));
+    const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0]));
+    const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0]));
+
+    for (; p != endAligned; p += 16) {
+        const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+        const __m128i t1 = _mm_cmpeq_epi8(s, dq);
+        const __m128i t2 = _mm_cmpeq_epi8(s, bs);
+        const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F
+        const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3);
+        unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x));
+        if (RAPIDJSON_UNLIKELY(r != 0)) {   // some of characters is escaped
+            SizeType len;
+#ifdef _MSC_VER         // Find the index of first escaped
+            unsigned long offset;
+            _BitScanForward(&offset, r);
+            len = offset;
+#else
+            len = static_cast<SizeType>(__builtin_ffs(r) - 1);
+#endif
+            char* q = reinterpret_cast<char*>(os_->PushUnsafe(len));
+            for (size_t i = 0; i < len; i++)
+                q[i] = p[i];
+
+            p += len;
+            break;
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(os_->PushUnsafe(16)), s);
+    }
+
+    is.src_ = p;
+    return RAPIDJSON_LIKELY(is.Tell() < length);
+}
+#elif defined(RAPIDJSON_NEON)
+template<>
+inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) {
+    if (length < 16)
+        return RAPIDJSON_LIKELY(is.Tell() < length);
+
+    if (!RAPIDJSON_LIKELY(is.Tell() < length))
+        return false;
+
+    const char* p = is.src_;
+    const char* end = is.head_ + length;
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15));
+    if (nextAligned > end)
+        return true;
+
+    while (p != nextAligned)
+        if (*p < 0x20 || *p == '\"' || *p == '\\') {
+            is.src_ = p;
+            return RAPIDJSON_LIKELY(is.Tell() < length);
+        }
+        else
+            os_->PutUnsafe(*p++);
+
+    // The rest of string using SIMD
+    const uint8x16_t s0 = vmovq_n_u8('"');
+    const uint8x16_t s1 = vmovq_n_u8('\\');
+    const uint8x16_t s2 = vmovq_n_u8('\b');
+    const uint8x16_t s3 = vmovq_n_u8(32);
+
+    for (; p != endAligned; p += 16) {
+        const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
+        uint8x16_t x = vceqq_u8(s, s0);
+        x = vorrq_u8(x, vceqq_u8(s, s1));
+        x = vorrq_u8(x, vceqq_u8(s, s2));
+        x = vorrq_u8(x, vcltq_u8(s, s3));
+
+        x = vrev64q_u8(x);                     // Rev in 64
+        uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(x), 0);   // extract
+        uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(x), 1);  // extract
+
+        SizeType len = 0;
+        bool escaped = false;
+        if (low == 0) {
+            if (high != 0) {
+                uint32_t lz = internal::clzll(high);
+                len = 8 + (lz >> 3);
+                escaped = true;
+            }
+        } else {
+            uint32_t lz = internal::clzll(low);
+            len = lz >> 3;
+            escaped = true;
+        }
+        if (RAPIDJSON_UNLIKELY(escaped)) {   // some of characters is escaped
+            char* q = reinterpret_cast<char*>(os_->PushUnsafe(len));
+            for (size_t i = 0; i < len; i++)
+                q[i] = p[i];
+
+            p += len;
+            break;
+        }
+        vst1q_u8(reinterpret_cast<uint8_t *>(os_->PushUnsafe(16)), s);
+    }
+
+    is.src_ = p;
+    return RAPIDJSON_LIKELY(is.Tell() < length);
+}
+#endif // RAPIDJSON_NEON
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(_MSC_VER) || defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_RAPIDJSON_H_
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index 3884902bbf..573571bc07 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -59,6 +59,7 @@ template <ck::index_t NDimSpatial,
           ck::index_t NumAElementwiseTensor                                         = 0,
           ck::index_t NumBElementwiseTensor                                         = 0,
           ck::index_t NumDElementwiseTensor                                         = 0,
+          typename ComputeDataType                                                  = InDataType,
           typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvFwd : public device::BaseOperator
 {
@@ -163,8 +164,18 @@ struct ReferenceConvFwd : public device::BaseOperator
                                                      k,
                                                      c,
                                                      x);
-                                v_acc +=
-                                    ck::type_convert<float>(v_in) * ck::type_convert<float>(v_wei);
+                                if constexpr(is_same_v<ComputeDataType, ck::tf32_t>)
+                                {
+                                    v_acc += ck::type_convert<float>(
+                                                 ck::type_convert<ComputeDataType>(v_in)) *
+                                             ck::type_convert<float>(
+                                                 ck::type_convert<ComputeDataType>(v_wei));
+                                }
+                                else
+                                {
+                                    v_acc += ck::type_convert<float>(v_in) *
+                                             ck::type_convert<float>(v_wei);
+                                }
                             }
                         }
                     }
@@ -238,8 +249,18 @@ struct ReferenceConvFwd : public device::BaseOperator
                                                          c,
                                                          y,
                                                          x);
-                                    v_acc += ck::type_convert<float>(v_in) *
-                                             ck::type_convert<float>(v_wei);
+                                    if constexpr(is_same_v<ComputeDataType, ck::tf32_t>)
+                                    {
+                                        v_acc += ck::type_convert<float>(
+                                                     ck::type_convert<ComputeDataType>(v_in)) *
+                                                 ck::type_convert<float>(
+                                                     ck::type_convert<ComputeDataType>(v_wei));
+                                    }
+                                    else
+                                    {
+                                        v_acc += ck::type_convert<float>(v_in) *
+                                                 ck::type_convert<float>(v_wei);
+                                    }
                                 }
                             }
                         }
@@ -327,8 +348,18 @@ struct ReferenceConvFwd : public device::BaseOperator
                                                              z,
                                                              y,
                                                              x);
-                                        v_acc += ck::type_convert<float>(v_in) *
-                                                 ck::type_convert<float>(v_wei);
+                                        if constexpr(is_same_v<ComputeDataType, ck::tf32_t>)
+                                        {
+                                            v_acc += ck::type_convert<float>(
+                                                         ck::type_convert<ComputeDataType>(v_in)) *
+                                                     ck::type_convert<float>(
+                                                         ck::type_convert<ComputeDataType>(v_wei));
+                                        }
+                                        else
+                                        {
+                                            v_acc += ck::type_convert<float>(v_in) *
+                                                     ck::type_convert<float>(v_wei);
+                                        }
                                     }
                                 }
                             }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index ed07e53e6d..8b9b973b2d 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -25,6 +25,12 @@ template <typename ADataType,
           typename ComputeTypeB = ComputeTypeA>
 struct ReferenceGemm : public device::BaseOperator
 {
+
+    using ElementDataTypeA =
+        ck::conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA>;
+    using ElementDataTypeB =
+        ck::conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB>;
+
     // Argument
     struct Argument : public device::BaseArgument
     {
@@ -63,8 +69,8 @@ struct ReferenceGemm : public device::BaseOperator
                 const int K = arg.a_m_k_.mDesc.GetLengths()[1];
 
                 AccDataType v_acc{0};
-                ComputeTypeA v_a{0};
-                ComputeTypeB v_b{0};
+                ElementDataTypeA v_a{0};
+                ElementDataTypeB v_b{0};
 
                 for(int k = 0; k < K; ++k)
                 {
@@ -77,16 +83,16 @@ struct ReferenceGemm : public device::BaseOperator
                         else
                             i4 = (i4x2 >> 4) & 0xf;
                         i4  = i4 - 8;
-                        v_a = type_convert<ComputeTypeA>(i4);
+                        v_a = type_convert<ElementDataTypeA>(i4);
                     }
                     else if constexpr(is_same_v<ADataType, f4x2_pk_t>)
                     {
                         // TODO: add support for ColMajor layout as well
                         if(k % 2 == 1)
-                            v_a = type_convert<ComputeTypeA>(
+                            v_a = type_convert<ElementDataTypeA>(
                                 f4_t(arg.a_m_k_(m, k).template unpack<>(Number<1>{})));
                         else
-                            v_a = type_convert<ComputeTypeA>(
+                            v_a = type_convert<ElementDataTypeA>(
                                 f4_t(arg.a_m_k_(m, k).template unpack<>(Number<0>{})));
                     }
                     else if constexpr(is_same_v<ADataType, f6x16_pk_t> ||
@@ -94,7 +100,7 @@ struct ReferenceGemm : public device::BaseOperator
                                       is_same_v<ADataType, f6x32_pk_t> ||
                                       is_same_v<ADataType, bf6x32_pk_t>)
                     {
-                        v_a = type_convert<ComputeTypeA>(
+                        v_a = type_convert<ElementDataTypeA>(
                             arg.a_m_k_(m, k).unpack(k % ADataType::packed_size));
                     }
                     else
@@ -111,16 +117,16 @@ struct ReferenceGemm : public device::BaseOperator
                         else
                             i4 = (i4x2 >> 4) & 0xf;
                         i4  = i4 - 8;
-                        v_b = type_convert<ComputeTypeB>(i4);
+                        v_b = type_convert<ElementDataTypeB>(i4);
                     }
                     else if constexpr(is_same_v<BDataType, f4x2_pk_t>)
                     {
                         // TODO: add support for RowMajor layout as well
                         if(k % 2 == 1)
-                            v_b = type_convert<ComputeTypeB>(
+                            v_b = type_convert<ElementDataTypeB>(
                                 f4_t(arg.b_k_n_(k, n).template unpack<>(Number<1>{})));
                         else
-                            v_b = type_convert<ComputeTypeB>(
+                            v_b = type_convert<ElementDataTypeB>(
                                 f4_t(arg.b_k_n_(k, n).template unpack<>(Number<0>{})));
                     }
                     else if constexpr(is_same_v<BDataType, f6x16_pk_t> ||
@@ -128,7 +134,7 @@ struct ReferenceGemm : public device::BaseOperator
                                       is_same_v<BDataType, f6x32_pk_t> ||
                                       is_same_v<BDataType, bf6x32_pk_t>)
                     {
-                        v_b = type_convert<ComputeTypeB>(
+                        v_b = type_convert<ElementDataTypeB>(
                             arg.b_k_n_(k, n).unpack(k % BDataType::packed_size));
                     }
                     else
@@ -136,8 +142,18 @@ struct ReferenceGemm : public device::BaseOperator
                         arg.b_element_op_(v_b, arg.b_k_n_(k, n));
                     }
 
-                    v_acc +=
-                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                    if constexpr(is_same_v<ComputeTypeA, ComputeTypeB> &&
+                                 is_same_v<ComputeTypeA, ck::tf32_t>)
+                    { // only for tf32 now
+                        v_acc +=
+                            ck::type_convert<AccDataType>(ck::type_convert<ComputeTypeA>(v_a)) *
+                            ck::type_convert<AccDataType>(ck::type_convert<ComputeTypeB>(v_b));
+                    }
+                    else
+                    {
+                        v_acc +=
+                            ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                    }
                 }
 
                 CDataType v_c{0};
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
index 59dfd76ede..d9c6cc5027 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
@@ -172,26 +172,26 @@ struct ReferenceMoeGemm : public device::BaseOperator
 
                     if constexpr(ActivationType == 1)
                     {
-                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t);
+                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t, 0);
                         if constexpr(is_same_v<BDataType, pk_i4_t>)
                         {
                             v_c_up *= 16;
                             v_c *= 16;
                         }
                         tensor_operation::element_wise::Silu{}(v_c, v_c);
-                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t);
+                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t, 0);
                         arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
                     }
                     else if constexpr(ActivationType == 0)
                     {
-                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t);
+                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t, 0);
                         if constexpr(is_same_v<BDataType, pk_i4_t>)
                         {
                             v_c_up *= 16;
                             v_c *= 16;
                         }
                         tensor_operation::element_wise::Gelu{}(v_c, v_c);
-                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t);
+                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t, 0);
                         arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
                     }
                 }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
index 58e4adfdfa..33239c94ec 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
@@ -144,8 +144,11 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
                             ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                     }
                     CDataType v_c{0};
-                    D0DataType v_d0 = arg.d0_(t, topk_id); // a
-                    D0DataType v_d1 = arg.d1_(e, n);       // b
+                    D0DataType v_d0 = arg.d0_.mDesc.GetNumOfDimension() == 3
+                                          ? arg.d0_(t, topk_id, 0)
+                                          : arg.d0_(t, topk_id); // a
+
+                    D0DataType v_d1 = arg.d1_(e, n); // b
                     if constexpr(MulRoutedWeight)
                     {
                         arg.c_element_op_(v_c, v_acc, v_d0, v_d1, v_topk_w);
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
index 28274a5154..cf30bc7dda 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
@@ -38,6 +38,10 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
                       const CDEElementwiseOperation c_element_op)
 {
     using RowMajor = ck::tensor_layout::gemm::RowMajor;
+    using ElementDataTypeA =
+        ck::conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA>;
+    using ElementDataTypeB =
+        ck::conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB>;
 
     const int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
     const int col_idx = blockIdx.y * blockDim.y + threadIdx.y;
@@ -46,8 +50,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
     {
 
         AccDataType v_acc{0};
-        ComputeTypeA v_a{0};
-        ComputeTypeB v_b{0};
+        ElementDataTypeA v_a{0};
+        ElementDataTypeB v_b{0};
         CDataType v_c{0};
 
         for(int k_idx = 0; k_idx < k; ++k_idx)
@@ -76,7 +80,16 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
             // apply b_element_op
             b_element_op(v_b, p_b_grid[element_idx_b]);
             // multiply and accumulate
-            v_acc += type_convert<AccDataType>(v_a) * type_convert<AccDataType>(v_b);
+            if constexpr(is_same_v<ComputeTypeA, ComputeTypeB> &&
+                         is_same_v<ComputeTypeA, ck::tf32_t>)
+            { // only for tf32 now
+                v_acc += ck::type_convert<AccDataType>(ck::type_convert<ComputeTypeA>(v_a)) *
+                         ck::type_convert<AccDataType>(ck::type_convert<ComputeTypeB>(v_b));
+            }
+            else
+            {
+                v_acc += type_convert<AccDataType>(v_a) * type_convert<AccDataType>(v_b);
+            }
         }
         // apply c_element_op
         c_element_op(v_c, v_acc);
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index bf7f1b4fa4..ec1b379ead 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -16,6 +16,7 @@ namespace instance {
 // aliasing, for commonly used data type
 using F64  = double;
 using F32  = float;
+using TF32 = ck::tf32_t;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using I8   = int8_t;
@@ -47,6 +48,9 @@ using BF16_Tuple    = ck::Tuple<BF16>;
 
 using F32_F32_Tuple = ck::Tuple<F32, F32>;
 
+// Generic layouts
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 // GEMM layout
 using Row  = ck::tensor_layout::gemm::RowMajor;
 using Col  = ck::tensor_layout::gemm::ColumnMajor;
@@ -54,6 +58,7 @@ using MFMA = ck::tensor_layout::gemm::MFMA;
 
 using Row_Tuple     = ck::Tuple<Row>;
 using Row_Row_Tuple = ck::Tuple<Row, Row>;
+using Row_Col_Tuple = ck::Tuple<Row, Col>;
 
 // Conv layout
 //
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
index 42ca8e755d..95d837f235 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,70 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+
+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_BF16
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+#endif // CK_ENABLE_BF16
+#ifdef CK_ENABLE_FP16
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Row,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
     std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
@@ -46,6 +110,8 @@ void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_i
                                                       PassThrough,
                                                       PassThrough,
                                                       PassThrough>>>& instances);
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
 template <typename ALayout,
           typename B0Layout,
           typename B1Layout,
@@ -86,7 +152,46 @@ struct DeviceOperationInstanceFactory<
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<B0DataType, bhalf_t> &&
+                     is_same_v<B1DataType, bhalf_t> && is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                         is_same_v<B1Layout, Row> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                              is_same_v<B1Layout, Col> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance(
+                    op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_BF16
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
+                     is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                         is_same_v<B1Layout, Row> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                              is_same_v<B1Layout, Col> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+                    op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+#ifdef CK_USE_XDL
+#ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                      is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t>)
         {
@@ -103,10 +208,11 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
         return op_ptrs;
     }
 };
-#endif
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add.hpp
index 030f3c2760..076474de36 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -41,8 +42,37 @@ void add_device_gemm_add_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instances(
                                                     PassThrough,
                                                     PassThrough,
                                                     Add>>>&);
+#endif
 
-// GEMM + Add +
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Add>>>&);
+
+void add_device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Add>>>&);
+#endif
+
+// GEMM + Add
 template <typename ALayout,
           typename BLayout,
           typename D0Layout,
@@ -52,17 +82,91 @@ template <typename ALayout,
           typename D0DataType,
           typename EDataType>
 struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
-                                                      BLayout,
-                                                      ck::Tuple<D0Layout>,
-                                                      ELayout,
-                                                      ADataType,
-                                                      BDataType,
-                                                      ck::Tuple<D0DataType>,
-                                                      EDataType,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add>>
+    ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
+                                                            BLayout,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            Add>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               Add>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with Add at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+
+#if defined(CK_ENABLE_FP16)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(op_ptrs);
+            }
+        }
+#endif
+
+#if defined(CK_ENABLE_BF16)
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<D0DataType, ck::bhalf_t> && is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
+// GEMM + Add
+// DeviceGemmMultipleD specialization
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<D0Layout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<D0DataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Add>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -80,6 +184,7 @@ struct DeviceOperationInstanceFactory<
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#ifdef CK_USE_XDL
 #if defined(CK_ENABLE_INT8) && defined(CK_ENABLE_FP16)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
@@ -104,10 +209,32 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Add>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
+
         return op_ptrs;
     }
 };
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
index 99b2ad1315..33a01cb68b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -11,11 +11,13 @@
 
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 
+#if defined(CK_ENABLE_FP16)
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -67,8 +69,64 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn
                                                     PassThrough,
                                                     PassThrough,
                                                     AddAddFastGelu>>>&);
+#endif // CK_USE_XDL
 
-// GEMM + Add + Add + FastGelu
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>&);
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>&);
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>&);
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>&);
+#endif // CK_USE_WMMA
+
+// GEMM + Add + FastGelu
+// DeviceGemmMultipleDSplitK specialization
 template <typename ALayout,
           typename BLayout,
           typename D0Layout,
@@ -79,18 +137,100 @@ template <typename ALayout,
           typename D0DataType,
           typename D1DataType,
           typename EDataType>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
-    ALayout,
-    BLayout,
-    ck::Tuple<D0Layout, D1Layout>,
-    ELayout,
-    ADataType,
-    BDataType,
-    ck::Tuple<D0DataType, D1DataType>,
-    EDataType,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::AddAddFastGelu>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                ck::Tuple<D0Layout, D1Layout>,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ck::Tuple<D0DataType, D1DataType>,
+                                                                EDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddAddFastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout, D1Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType, D1DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               AddAddFastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with AddFastGelu at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        constexpr bool IsAllDRowLayout = is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row>;
+        constexpr bool IsAllDFloat16 =
+            is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t>;
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<EDataType, half_t> && IsAllDRowLayout && IsAllDFloat16)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_USE_WMMA
+
+        return op_ptrs;
+    }
+};
+
+// GEMM + Add + Add + FastGelu
+// DeviceGemmMultipleD specialization
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<D0Layout, D1Layout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<D0DataType, D1DataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -100,47 +240,69 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                                          BDataType,
                                          ck::Tuple<D0DataType, D1DataType>,
                                          EDataType,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::AddAddFastGelu>;
+                                         PassThrough,
+                                         PassThrough,
+                                         AddAddFastGelu>;
 
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#if defined(CK_USE_XDL)
+        constexpr bool IsAllDRowLayout = is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row>;
+        constexpr bool IsAllDFloat16 =
+            is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t>;
+
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                     is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
-                     is_same_v<EDataType, half_t>)
+                     is_same_v<EDataType, half_t> && IsAllDRowLayout && IsAllDFloat16)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
-                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                          is_same_v<ELayout, Row>)
             {
                 add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
                     op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
-                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                               is_same_v<ELayout, Row>)
             {
                 add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
                     op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
-                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                               is_same_v<ELayout, Row>)
             {
                 add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
                     op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
-                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                               is_same_v<ELayout, Row>)
             {
                 add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
                     op_ptrs);
             }
         }
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout, D1Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType, D1DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         AddAddFastGelu>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
@@ -150,3 +312,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif // CK_ENABLE_FP16
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
index 555b52de75..cb77e8f5e0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -93,8 +94,64 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_in
                                                     PassThrough,
                                                     PassThrough,
                                                     AddFastGelu>>>&);
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+#endif // CK_USE_WMMA
 
 // GEMM + Add + FastGelu
+// DeviceGemmMultipleDSplitK specialization
 template <typename ALayout,
           typename BLayout,
           typename D0Layout,
@@ -103,18 +160,97 @@ template <typename ALayout,
           typename BDataType,
           typename D0DataType,
           typename EDataType>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
-                                                      BLayout,
-                                                      ck::Tuple<D0Layout>,
-                                                      ELayout,
-                                                      ADataType,
-                                                      BDataType,
-                                                      ck::Tuple<D0DataType>,
-                                                      EDataType,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      AddFastGelu>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                ck::Tuple<D0Layout>,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ck::Tuple<D0DataType>,
+                                                                EDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddFastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               AddFastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with AddFastGelu at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+#if defined(CK_ENABLE_FP16)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+
+        return op_ptrs;
+    }
+};
+
+// GEMM + Add + FastGelu
+// DeviceGemmMultipleD specialization
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<D0Layout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<D0DataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -132,7 +268,8 @@ struct DeviceOperationInstanceFactory<
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-#if defined(CK_ENABLE_INT8) && defined(CK_ENABLE_FP16)
+#if defined(CK_USE_XDL)
+#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
         {
@@ -143,7 +280,7 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
-#endif
+#endif // CK_ENABLE_FP16 && CK_ENABLE_INT8
 
 #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
         if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, int8_t> &&
@@ -156,8 +293,9 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
-#endif
+#endif // CK_ENABLE_BF16 && CK_ENABLE_INT8
 
+#if defined(CK_ENABLE_FP16)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
         {
@@ -186,6 +324,29 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         AddFastGelu>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
index 481915d00b..7a38f43a9a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -70,6 +71,145 @@ void add_device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_
                                                     PassThrough,
                                                     PassThrough,
                                                     AddMultiply>>>&);
+#endif
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>&);
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>&);
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>&);
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>&);
+#endif
+
+// GEMM + Add + Multiply
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleDSplitK<
+    ALayout,
+    BLayout,
+    ck::Tuple<D0Layout, D1Layout>,
+    ELayout,
+    ADataType,
+    BDataType,
+    ck::Tuple<D0DataType, D1DataType>,
+    EDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::AddMultiply>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout, D1Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType, D1DataType>,
+                                               EDataType,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::AddMultiply>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_USE_XDL
+
+#endif
+
+#if defined(CK_USE_WMMA)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
+                     is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
+        return op_ptrs;
+    }
+};
 
 // GEMM + Add + Multiply
 template <typename ALayout,
@@ -111,6 +251,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
                      is_same_v<EDataType, half_t>)
@@ -144,6 +285,27 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
+#endif
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout, D1Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType, D1DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         AddMultiply>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp
index 293e14b811..4e706bb0c5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_relu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -41,6 +42,35 @@ void add_device_gemm_add_relu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instan
                                                     PassThrough,
                                                     PassThrough,
                                                     AddRelu>>>&);
+#endif
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddRelu>>>&);
+
+void add_device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddRelu>>>&);
+#endif
 
 // GEMM + Add + Relu
 template <typename ALayout,
@@ -52,17 +82,92 @@ template <typename ALayout,
           typename D0DataType,
           typename EDataType>
 struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
-                                                      BLayout,
-                                                      ck::Tuple<D0Layout>,
-                                                      ELayout,
-                                                      ADataType,
-                                                      BDataType,
-                                                      ck::Tuple<D0DataType>,
-                                                      EDataType,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      AddRelu>>
+    ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
+                                                            BLayout,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            AddRelu>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               AddRelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with AddRelu at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+
+#if defined(CK_ENABLE_FP16)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
+#if defined(CK_ENABLE_BF16)
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<D0DataType, ck::bhalf_t> && is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
+// GEMM + Add + Relu
+// DeviceGemmMultipleD specialization
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<D0Layout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<D0DataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddRelu>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -80,6 +185,7 @@ struct DeviceOperationInstanceFactory<
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#ifdef CK_USE_XDL
 #if defined(CK_ENABLE_INT8) && defined(CK_ENABLE_FP16)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
@@ -106,10 +212,32 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         AddRelu>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
+
         return op_ptrs;
     }
 };
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp
index fbf45852ce..0b140fb07d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_add_silu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -41,6 +42,107 @@ void add_device_gemm_add_silu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instan
                                                     PassThrough,
                                                     PassThrough,
                                                     AddSilu>>>&);
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddSilu>>>&);
+
+void add_device_gemm_add_silu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddSilu>>>&);
+#endif // CK_USE_WMMA
+
+// GEMM + Add + Silu
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
+                                                            BLayout,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            AddSilu>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               AddSilu>;
+
+    static auto GetInstances()
+    {
+
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // no split-k xdl implementations
+#endif // CL_USE_XDL
+#if defined(CK_USE_WMMA)
+#if defined(CK_ENABLE_FP16)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_FP16
+#if defined(CK_ENABLE_BF16)
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<D0DataType, ck::bhalf_t> && is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_silu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif // CK_USE_WMMA
+        return op_ptrs;
+    }
+};
 
 // GEMM + Add + Silu
 template <typename ALayout,
@@ -78,8 +180,11 @@ struct DeviceOperationInstanceFactory<
 
     static auto GetInstances()
     {
+
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#if defined(CK_USE_XDL)
+
 #if defined(CK_ENABLE_INT8) && defined(CK_ENABLE_FP16)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
@@ -105,7 +210,28 @@ struct DeviceOperationInstanceFactory<
             }
         }
 #endif
+#endif // CL_USE_XDL
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         AddSilu>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
 
+#endif // CK_USE_WMMA
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
index 6ee88bd855..5c58a7f239 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
@@ -16,7 +16,8 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-#if defined(CK_ENABLE_FP16) && defined(CK_USE_XDL)
+#if defined(CK_USE_XDL)
+#if defined(CK_ENABLE_FP16)
 void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
                                                     Row,
@@ -68,8 +69,11 @@ void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance
                                                     PassThrough,
                                                     PassThrough,
                                                     Bilinear>>>& instances);
-#endif
-#if defined(CK_ENABLE_INT8) && defined(CK_USE_WMMA)
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+#if defined(CK_ENABLE_INT8)
 void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -121,7 +125,63 @@ void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(
                                                     PassThrough,
                                                     PassThrough,
                                                     Bilinear>>>& instances);
-#endif
+#endif // CK_ENABLE_INT8
+
+#if defined(CK_ENABLE_FP16)
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances);
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+
 // GEMM + Bilinear
 template <typename ALayout,
           typename BLayout,
@@ -131,18 +191,95 @@ template <typename ALayout,
           typename BDataType,
           typename DDataType,
           typename EDataType>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
-    ALayout,
-    BLayout,
-    ck::Tuple<DLayout>,
-    ELayout,
-    ADataType,
-    BDataType,
-    ck::Tuple<DDataType>,
-    EDataType,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Bilinear>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                ck::Tuple<DLayout>,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ck::Tuple<DDataType>,
+                                                                EDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Bilinear>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<DLayout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<DDataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               Bilinear>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with AddBilinear at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+#if defined(CK_ENABLE_FP16)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<DDataType, half_t> && is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+
+        return op_ptrs;
+    }
+};
+
+// GEMM + Bilinear
+template <typename ALayout,
+          typename BLayout,
+          typename DLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<DLayout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<DDataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -152,14 +289,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                                          BDataType,
                                          ck::Tuple<DDataType>,
                                          EDataType,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::Bilinear>;
+                                         PassThrough,
+                                         PassThrough,
+                                         Bilinear>;
 
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-#if defined(CK_ENABLE_FP16) && defined(CK_USE_XDL)
+#if defined(CK_USE_XDL)
+#if defined(CK_ENABLE_FP16)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<DDataType, half_t> && is_same_v<EDataType, half_t>)
         {
@@ -188,8 +326,31 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
-#endif
-#if defined(CK_ENABLE_INT8) && defined(CK_USE_WMMA)
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<DLayout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<DDataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         Bilinear>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+
+        // Bilinear wmma i8 instances are using DeviceGemmMultipleD interface.
+#if defined(CK_ENABLE_INT8)
         if constexpr(is_same_v<ADataType, std::int8_t> && is_same_v<BDataType, std::int8_t> &&
                      is_same_v<DDataType, std::int8_t> && is_same_v<EDataType, std::int8_t>)
         {
@@ -214,7 +375,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                 add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(op_ptrs);
             }
         }
-#endif
+#endif // CK_ENABLE_INT8
+#endif // CK_USE_WMMA
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
index f6612fc259..c2489992b1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -67,6 +68,132 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
                                                     PassThrough,
                                                     PassThrough,
                                                     FastGelu>>>&);
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>&);
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>&);
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>&);
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>&);
+#endif // CK_USE_WMMA
+
+// GEMM + Add + FastGelu
+// DeviceGemmMultipleDSplitK specialization
+template <typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                Empty_Tuple,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                Empty_Tuple,
+                                                                EDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                FastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               Empty_Tuple,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               Empty_Tuple,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               FastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#if defined(CK_USE_XDL)
+        // No XDL instances for DeviceGemmMultipleDSplitK with AddFastGelu at the moment
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif // CK_USE_WMMA
+
+        return op_ptrs;
+    }
+};
 
 // GEMM + FastGelu
 template <typename ALayout,
@@ -75,17 +202,17 @@ template <typename ALayout,
           typename ADataType,
           typename BDataType,
           typename EDataType>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
-                                                                                        BLayout,
-                                                                                        Empty_Tuple,
-                                                                                        ELayout,
-                                                                                        ADataType,
-                                                                                        BDataType,
-                                                                                        Empty_Tuple,
-                                                                                        EDataType,
-                                                                                        PassThrough,
-                                                                                        PassThrough,
-                                                                                        FastGelu>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          Empty_Tuple,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          Empty_Tuple,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -103,6 +230,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#if defined(CK_USE_XDL)
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<EDataType, half_t>)
         {
@@ -127,6 +255,28 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                 add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
             }
         }
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         Empty_Tuple,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         Empty_Tuple,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         FastGelu>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
@@ -136,4 +286,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
+#endif // CK_ENABLE_FP16
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp
index 6e2950180d..3ebfdfa0d3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -17,11 +17,229 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using Multiply    = ck::tensor_operation::element_wise::Multiply;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+using Multiply            = ck::tensor_operation::element_wise::Multiply;
+using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu         = ck::tensor_operation::element_wise::AddFastGelu;
+using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
 
 #ifdef CK_ENABLE_INT8
+
+#ifdef CK_USE_WMMA
+// RRR
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            AddFastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            Add>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            FastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            PassThrough>>>& instances);
+
+// RCR
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Col, Col>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            AddFastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Col, Col>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            Add>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Col, Col>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            FastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Col, Col>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            PassThrough>>>& instances);
+
+// CRR
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Col>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            AddFastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Col>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            Add>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Col>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            FastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Col>,
+                                                            ck::Tuple<Row, Row>,
+                                                            ck::Tuple<>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8, BF16>,
+                                                            ck::Tuple<>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            Multiply,
+                                                            PassThrough>>>& instances);
+
+// Multiply
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8>,
+                                                            ck::Tuple<BF16, BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            MultiplyAddFastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            ck::Tuple<Row, Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8>,
+                                                            ck::Tuple<BF16, BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            MultiplyAdd>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            MultiplyFastGelu>>>& instances);
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            ck::Tuple<Row>,
+                                                            Row,
+                                                            ck::Tuple<BF16>,
+                                                            ck::Tuple<I8>,
+                                                            ck::Tuple<BF16>,
+                                                            BF16,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            Multiply>>>& instances);
+
+#endif
+
+#ifdef CK_USE_XDL
 // RRR
 void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
@@ -198,7 +416,7 @@ void add_device_gemm_xdl_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_i
 void add_device_gemm_xdl_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleABD<ck::Tuple<Row>,
                                                       ck::Tuple<Row>,
-                                                      ck::Tuple<Row>,
+                                                      ck::Tuple<Row, Row>,
                                                       Row,
                                                       ck::Tuple<BF16>,
                                                       ck::Tuple<I8>,
@@ -233,10 +451,88 @@ void add_device_gemm_xdl_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instances(
                                                       PassThrough,
                                                       PassThrough,
                                                       Multiply>>>& instances);
-
+#endif
 #endif
 
 // GEMM + Add + Gelu
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              Multiply,
+                                                              AddFastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 Multiply,
+                                                 AddFastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+                    op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
+                    op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -275,6 +571,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
@@ -300,6 +597,27 @@ struct DeviceOperationInstanceFactory<
                 add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(op_ptrs);
             }
         }
+#endif
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           Multiply,
+                                                           AddFastGelu>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 #endif
 
         return op_ptrs;
@@ -307,6 +625,81 @@ struct DeviceOperationInstanceFactory<
 };
 
 // GEMM + Add
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              Multiply,
+                                                              Add>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 Multiply,
+                                                 Add>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(op_ptrs);
+            }
+        }
+#endif
+
+#endif
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -345,6 +738,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
@@ -372,11 +766,107 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           Multiply,
+                                                           Add>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
+#endif
+
         return op_ptrs;
     }
 };
 
 // GEMM + Gelu
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              Multiply,
+                                                              FastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 Multiply,
+                                                 FastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
+                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(op_ptrs);
+            }
+        }
+#endif
+#endif
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -415,6 +905,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                      is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
@@ -442,11 +933,106 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           Multiply,
+                                                           FastGelu>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
+#endif
         return op_ptrs;
     }
 };
 
 // GEMM
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              Multiply,
+                                                              PassThrough>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 Multiply,
+                                                 PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
+                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(op_ptrs);
+            }
+
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
+                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(op_ptrs);
+            }
+        }
+#endif
+#endif
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -485,6 +1071,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                      is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
@@ -511,13 +1098,95 @@ struct DeviceOperationInstanceFactory<
             }
         }
 #endif
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           Multiply,
+                                                           PassThrough>;
 
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
+#endif
         return op_ptrs;
     }
 };
 
 // Multiply
 // GEMM + Add + Gelu
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              MultiplyAddFastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 PassThrough,
+                                                 MultiplyAddFastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16, BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row, Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -556,6 +1225,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16, BF16>> && is_same_v<EDataType, BF16>)
@@ -568,6 +1238,27 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           MultiplyAddFastGelu>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
 #endif
 
         return op_ptrs;
@@ -575,6 +1266,67 @@ struct DeviceOperationInstanceFactory<
 };
 
 // GEMM + Add
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              MultiplyAdd>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 PassThrough,
+                                                 MultiplyAdd>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16, BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row, Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -613,6 +1365,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16, BF16>> && is_same_v<EDataType, BF16>)
@@ -625,6 +1378,27 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           MultiplyAdd>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
 #endif
 
         return op_ptrs;
@@ -632,6 +1406,68 @@ struct DeviceOperationInstanceFactory<
 };
 
 // GEMM + Gelu
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              MultiplyFastGelu>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 PassThrough,
+                                                 MultiplyFastGelu>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -670,6 +1506,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
@@ -682,6 +1519,27 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           MultiplyFastGelu>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
 #endif
 
         return op_ptrs;
@@ -689,6 +1547,67 @@ struct DeviceOperationInstanceFactory<
 };
 
 // GEMM
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleABDSplitK<AsLayout,
+                                                              BsLayout,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AsDataType,
+                                                              BsDataType,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Multiply>>
+{
+    using DeviceOp = DeviceGemmMultipleABDSplitK<AsLayout,
+                                                 BsLayout,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 EDataType,
+                                                 PassThrough,
+                                                 PassThrough,
+                                                 Multiply>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleABDSplitK with Add at the moment
+#endif
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
+                     is_same_v<BsDataType, ck::Tuple<I8>> &&
+                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
+        {
+            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
+                         is_same_v<BsLayout, ck::Tuple<Row>> &&
+                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instances(op_ptrs);
+            }
+        }
+#endif
+#endif
+
+        return op_ptrs;
+    }
+};
+
 template <typename AsLayout,
           typename BsLayout,
           typename DsLayout,
@@ -727,6 +1646,7 @@ struct DeviceOperationInstanceFactory<
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
 #ifdef CK_ENABLE_INT8
+#ifdef CK_USE_XDL
         if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                      is_same_v<BsDataType, ck::Tuple<I8>> &&
                      is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
@@ -740,6 +1660,28 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#ifdef CK_USE_WMMA
+        using Wrapper = DeviceGemmMultipleABDSplitKWrapper<AsLayout,
+                                                           BsLayout,
+                                                           DsLayout,
+                                                           ELayout,
+                                                           AsDataType,
+                                                           BsDataType,
+                                                           DsDataType,
+                                                           EDataType,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Multiply>;
+
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
+#endif
+
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
index 64c74d4795..1d6ebd6307 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+#if defined(CK_USE_XDL)
 void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                     Row,
@@ -71,9 +72,64 @@ void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_m
                                                     PassThrough,
                                                     PassThrough,
                                                     MultiplyAdd>>>&);
-#endif
+#endif // CK_ENABLE_FP8
+#endif // CK_USE_XDL
+
+#if defined(CK_USE_WMMA)
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>&);
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>&);
+#ifdef CK_USE_WMMA_FP8
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>&);
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>&);
+#endif // CK_USE_WMMA_FP8
+#endif // CK_USE_WMMA
 
-// GEMM + Multiply + Add
 template <typename ALayout,
           typename BLayout,
           typename D0Layout,
@@ -84,18 +140,107 @@ template <typename ALayout,
           typename D0DataType,
           typename D1DataType,
           typename EDataType>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
-    ALayout,
-    BLayout,
-    ck::Tuple<D0Layout, D1Layout>,
-    ELayout,
-    ADataType,
-    BDataType,
-    ck::Tuple<D0DataType, D1DataType>,
-    EDataType,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::MultiplyAdd>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                ck::Tuple<D0Layout, D1Layout>,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ck::Tuple<D0DataType, D1DataType>,
+                                                                EDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                MultiplyAdd>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               ck::Tuple<D0Layout, D1Layout>,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               ck::Tuple<D0DataType, D1DataType>,
+                                               EDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               MultiplyAdd>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_USE_XDL
+        // No XDL instances for DeviceGemmMultipleDSplitK with MultiplyAdd at the moment
+#endif // CK_USE_XDL
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<ADataType, F16> && is_same_v<BDataType, F16> &&
+                     is_same_v<D0DataType, F16> && is_same_v<D1DataType, F16> &&
+                     is_same_v<EDataType, F16>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_USE_WMMA
+#ifdef CK_USE_WMMA_FP8
+        if constexpr(is_same_v<ADataType, F16> && is_same_v<BDataType, F8> &&
+                     is_same_v<D0DataType, F32> && is_same_v<D1DataType, F32> &&
+                     is_same_v<EDataType, F16>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_USE_WMMA
+
+        return op_ptrs;
+    }
+};
+
+// DeviceGemmMultipleD specialization
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleD<ALayout,
+                                                          BLayout,
+                                                          ck::Tuple<D0Layout, D1Layout>,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          ck::Tuple<D0DataType, D1DataType>,
+                                                          EDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>
 {
     using DeviceOp = DeviceGemmMultipleD<ALayout,
                                          BLayout,
@@ -105,17 +250,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                                          BDataType,
                                          ck::Tuple<D0DataType, D1DataType>,
                                          EDataType,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         ck::tensor_operation::element_wise::MultiplyAdd>;
+                                         PassThrough,
+                                         PassThrough,
+                                         MultiplyAdd>;
 
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                     is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
-                     is_same_v<EDataType, half_t>)
+#ifdef CK_USE_XDL
+        if constexpr(is_same_v<ADataType, F16> && is_same_v<BDataType, F16> &&
+                     is_same_v<D0DataType, F16> && is_same_v<D1DataType, F16> &&
+                     is_same_v<EDataType, F16>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
@@ -133,10 +279,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
             }
         }
 
-#if defined CK_ENABLE_FP8
-        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
-                     is_same_v<D0DataType, float> && is_same_v<D1DataType, float> &&
-                     is_same_v<EDataType, half_t>)
+#ifdef CK_ENABLE_FP8
+        if constexpr(is_same_v<ADataType, F16> && is_same_v<BDataType, F8> &&
+                     is_same_v<D0DataType, F32> && is_same_v<D1DataType, F32> &&
+                     is_same_v<EDataType, F16>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
@@ -153,7 +299,29 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
-#endif
+#endif // CK_ENABLE_FP8
+#endif // CK_USE_XDL
+
+#ifdef CK_USE_WMMA
+        // Reuse DeviceGemmMultipleDSplitK instances
+        using Wrapper = DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                                         BLayout,
+                                                         ck::Tuple<D0Layout, D1Layout>,
+                                                         ELayout,
+                                                         ADataType,
+                                                         BDataType,
+                                                         ck::Tuple<D0DataType, D1DataType>,
+                                                         EDataType,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         MultiplyAdd>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
index 6475b801b8..8a9d4bf933 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_FP8
 #ifdef CK_ENABLE_BF16
 void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instances_part1(
@@ -199,7 +200,7 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_i
                                                           PassThrough,
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
-#endif
+#endif // CK_ENABLE_BF16
 #ifdef CK_ENABLE_FP16
 void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
@@ -278,8 +279,8 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_in
                                                           PassThrough,
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
-#endif
-#endif
+#endif // CK_ENABLE_FP16
+#endif // CK_ENABLE_FP8
 
 #ifdef CK_ENABLE_FP16
 void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances_part1(
@@ -463,7 +464,7 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_in
                                                           PassThrough,
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
-#endif
+#endif // CK_ENABLE_FP16
 
 #if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_INT8))
 void add_device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_comp_default_instances(
@@ -544,7 +545,62 @@ void add_device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_mem_v2_kpadding_in
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
 
-#endif
+#endif // CK_ENABLE_FP16 || CK_ENABLE_INT8
+#endif // CK_USE_XDL
+
+#ifdef CK_USE_WMMA
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          F32_F32_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+#endif // CK_USE_WMMA
 
 template <typename ADataType,
           typename BDataType,
@@ -553,37 +609,36 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleDSplitK<
-    ALayout,
-    BLayout,
-    Tuple<Row, Col>,
-    CLayout,
-    ADataType,
-    BDataType,
-    DsDataType,
-    CDataType,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::MultiplyMultiply>>
+struct DeviceOperationInstanceFactory<DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                Tuple<Row, Col>,
+                                                                CLayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                DsDataType,
+                                                                CDataType,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                MultiplyMultiply>>
 {
-    using DeviceOp =
-        DeviceGemmMultipleDSplitK<ALayout,
-                                  BLayout,
-                                  Tuple<Row, Col>,
-                                  CLayout,
-                                  ADataType,
-                                  BDataType,
-                                  DsDataType,
-                                  CDataType,
-                                  ck::tensor_operation::element_wise::PassThrough,
-                                  ck::tensor_operation::element_wise::PassThrough,
-                                  ck::tensor_operation::element_wise::MultiplyMultiply>;
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               Tuple<Row, Col>,
+                                               CLayout,
+                                               ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               CDataType,
+                                               PassThrough,
+                                               PassThrough,
+                                               MultiplyMultiply>;
 
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-#ifdef CK_ENABLE_FP8
+#ifdef CK_USE_XDL
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94)
 #ifdef CK_ENABLE_BF16
         if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
                      is_same_v<CDataType, bhalf_t>)
@@ -624,7 +679,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
-#endif
+#endif // CK_ENABLE_BF16
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
                      is_same_v<CDataType, half_t>)
@@ -665,8 +720,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
-#endif
-#endif
+#endif // CK_ENABLE_FP16
+#endif // CK_ENABLE_FP8
 #if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_INT8))
         if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<CDataType, half_t>)
@@ -691,6 +746,51 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
             }
         }
 #endif
+#endif // CK_USE_XDL
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_km_nk_mn_instances(
+                    op_ptrs);
+            }
+        }
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_km_nk_mn_instances(
+                    op_ptrs);
+            }
+        }
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_km_nk_mn_instances(
+                    op_ptrs);
+            }
+        }
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_km_nk_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_USE_WMMA
+
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp
index 7727489e51..430a4e52f4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,6 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -20,6 +21,7 @@ namespace instance {
 using DsLayout   = ck::Tuple<>;
 using DsDataType = ck::Tuple<>;
 
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_FP16
 void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
@@ -326,7 +328,54 @@ void add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadd
                                                PassThrough,
                                                PassThrough,
                                                PassThrough>>>& instances);
+#endif
+#endif
 
+#ifdef CK_USE_WMMA
+#if defined(CK_ENABLE_FP16)
+void add_device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances);
+#endif
+
+#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8))
+void add_device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               BF16,
+                                               I8,
+                                               DsDataType,
+                                               BF16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances);
+#endif
+
+#if defined(CK_ENABLE_BF16)
+void add_device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               BF16,
+                                               BF16,
+                                               DsDataType,
+                                               BF16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances);
+#endif
 #endif
 
 template <typename ADataType,
@@ -373,6 +422,7 @@ struct DeviceOperationInstanceFactory<
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
             {
+#ifdef CK_USE_XDL
                 add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_instances(
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_kpadding_instances(
@@ -395,6 +445,12 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
                     op_ptrs);
+#endif
+
+#ifdef CK_USE_WMMA
+                add_device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+#endif
             }
         }
 #endif
@@ -406,6 +462,7 @@ struct DeviceOperationInstanceFactory<
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
             {
+#ifdef CK_USE_XDL
                 add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_default_instances(
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instances(
@@ -420,6 +477,12 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
                     op_ptrs);
+#endif
+
+#ifdef CK_USE_WMMA
+                add_device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+#endif
             }
         }
 #endif
@@ -430,6 +493,7 @@ struct DeviceOperationInstanceFactory<
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
             {
+#ifdef CK_USE_XDL
                 add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
@@ -444,6 +508,12 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
                 add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
                     op_ptrs);
+#endif
+
+#ifdef CK_USE_WMMA
+                add_device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+#endif
             }
         }
 #endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
index 11a8ff8e91..f16a345e14 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
@@ -76,6 +76,47 @@ using device_grouped_conv_bwd_data_xdl_f16_16_16_instances =
         // clang-format on
         >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // A K1 one access for each thread per load 
+        // 32x32
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 4, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 4, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        // 16x16      
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 2, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 2, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 2, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>
+        // clang-format on
+        >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -178,6 +219,48 @@ using device_grouped_conv_bwd_data_xdl_bf16_16_16_instances = std::tuple<
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>
+
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_bf16_optimized_loads_instances = std::tuple<
+    // clang-format off
+        // ##############################################|          NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|       Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // A K1 one access for each thread per load 
+        // 32x32
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 4, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 4, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                8>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    64,  16,   16,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        // 16x16        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 2, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 2, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 2, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    64,  16,   16,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,              4,         1,        S<4, 4, 16>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>
+
     // clang-format on
     >;
 
@@ -257,6 +340,41 @@ using device_grouped_conv_bwd_data_xdl_f32_16_16_instances =
         // clang-format on
         >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f32_optimized_loads_instances =
+    std::tuple<
+        // clang-format off
+       // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+       // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+       // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+       // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+       // A K1 one access for each thread per load 
+       // 32x32
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 4, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 4, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 16, 1, 16>,               2>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    32,   8,    8,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,  128,    32,    16,   4,    4,  32,   32,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 8, 1, 32>,                1>,
+        // 16x16      
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 2, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 2, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              1,         1,            1,            1,     S<1, 64, 1, 4>,                4>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 4>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         1,            1,            1,     S<1, 32, 1, 8>,                2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    32,   8,    8,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              4,         1,        S<4, 8, 8>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,     F32,  Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   64,    16,    16,   4,    4,  16,   16,       1,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 16, 4>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,            1,            1,     S<1, 16, 1, 16>,               1>
+        // clang-format on
+        >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_bilinear_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_bilinear_instance.hpp
index 1c3bfef8ce..416e64b534 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_bilinear_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_bilinear_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -139,6 +140,40 @@ using device_grouped_conv_fwd_xdl_bilinear_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_bilinear_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1,     TF32,            TF32>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32, Tuple<F32>,   F32, PassThrough, PassThrough,  Bilinear,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
index bbc2a54c34..0920b3277e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
@@ -24,6 +24,7 @@ using BF8 = ck::bf8_t;
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -205,6 +206,27 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_f32_tf32_comp_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, TF32, TF32>
+    // clang-format on
+    >;
+
 // double rate mfma instances on gfx950
 template <index_t NDimSpatial,
           typename ALayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
index 82c01a634b..568f0e0dc4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
index 768fcbada0..3b9a607daf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -24,6 +24,7 @@ using BF8 = ck::bf8_t;
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -199,7 +200,7 @@ using device_grouped_conv_fwd_xdl_f16_nchw_instances = std::tuple<
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  64,    64,     32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,               1>,
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,    64,    32,   8,   8,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 32, 1, 8>,              1>,
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  64,    64,     32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 8, 1, 8>,               1>,
-    // 32x32 instance 
+    // 32x32 instance
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
     DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
@@ -252,6 +253,25 @@ using device_grouped_conv_fwd_xdl_f32_generic_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_f32_tf32_generic_instances = std::tuple<
+    // clang-format off
+    //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| AComputeType| BComputeType|
+    //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|     DATATYPE | DATATYPE    |
+    //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl |
+    //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+    // generic instance
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1,     TF32,            TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -284,7 +304,45 @@ using device_grouped_conv_fwd_xdl_f32_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   192,    16,   4,   4,   32,   32,    2,    3,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| AComputeType| BComputeType|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|     DATATYPE | DATATYPE    |
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl |
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1,     TF32,            TF32>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4,     TF32,            TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   192,    16,   4,   4,   32,   32,    2,    3,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4,     TF32,            TF32>
     // clang-format on
     >;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
index 5a4a011512..1004025173 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -99,6 +100,27 @@ using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1, TF32, TF32>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
index 57bdeddcf9..44ef8a622c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
@@ -24,6 +24,7 @@ using BF8 = ck::bf8_t;
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -64,7 +65,7 @@ using device_grouped_conv_fwd_xdl_bf16_mem_instances = std::tuple<
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        // Latency friendly 
+        // Latency friendly
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsDataTypes,  BF16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsDataTypes,  BF16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsDataTypes,  BF16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -163,6 +164,41 @@ using device_grouped_conv_fwd_xdl_f32_mem_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          BlockGemmPipelineScheduler BlkGemmPipeSched,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_f32_tf32_mem_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, TF32, TF32>,
+        // Memory friendly
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsDataTypes,   F32, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, TF32, TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
index d07d82e7ee..acea88798e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -142,6 +143,27 @@ using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // Instances with NumGroupsPerBatch > 1
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,    F32,     F32,     F32,     F32, DsDataTypes,   F32,  PassThrough, PassThrough, OutElementOp,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, TF32, TF32, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,    F32,     F32,     F32,     F32, DsDataTypes,   F32,  PassThrough, PassThrough, OutElementOp,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, TF32, TF32, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,    F32,     F32,     F32,     F32, DsDataTypes,   F32,  PassThrough, PassThrough, OutElementOp,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, TF32, TF32, LoopScheduler::Default, 32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scale_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scale_instance.hpp
index f4dfc8f773..cddf65e43a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scale_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scale_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -139,6 +140,40 @@ using device_grouped_conv_fwd_xdl_scale_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scale_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1, TF32, TF32>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     Scale,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4, TF32, TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp
index 81e6b73b55..827148df97 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_ab_instance.hpp
@@ -16,6 +16,7 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using TF32 = ck::tf32_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -89,7 +90,7 @@ using device_grouped_conv_fwd_xdl_scaleadd_ab_f32_instances = std::tuple<
         //########################################|           |       |       |            |       |                       |                   |        |         |         |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
         // generic instance
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
-        // instances for small conv.K and conv.C        
+        // instances for small conv.K and conv.C
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
 
@@ -97,6 +98,27 @@ using device_grouped_conv_fwd_xdl_scaleadd_ab_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scaleadd_ab_f32_tf32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E|                  AData|              BData| AccData| CShuffle|       Ds| EData|           A|           B|             CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|                   Type|               Type|    Type| DataType| DataType|  Type| Elementwise| Elementwise|     Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |                       |                   |        |         |         |      |   Operation|   Operation|       Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |                       |                   |        |         |         |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,              1, TF32, TF32>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1, TF32, TF32>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, ck::Tuple<>,ELayout,   ck::Tuple<F32,  F32>, ck::Tuple<F32,  F32>,     F32, F32, ck::Tuple<>, F32,      ScaleAdd,   ScaleAdd, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4, TF32, TF32>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
index e9ff75a91d..a2f7190ab8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -111,6 +111,8 @@ struct DeviceOperationInstanceFactory<
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs);
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP32
@@ -121,6 +123,8 @@ struct DeviceOperationInstanceFactory<
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(op_ptrs);
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -132,6 +136,8 @@ struct DeviceOperationInstanceFactory<
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
             }
@@ -251,6 +257,8 @@ struct DeviceOperationInstanceFactory<
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
@@ -271,6 +279,8 @@ struct DeviceOperationInstanceFactory<
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -282,6 +292,8 @@ struct DeviceOperationInstanceFactory<
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_optimized_loads_instances(
+                        op_ptrs);
                 }
 #endif
             }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
index c723be0db8..10362a31c4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
@@ -83,6 +83,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instance
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
@@ -112,6 +126,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instance
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
@@ -141,6 +169,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instanc
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP16
@@ -393,6 +435,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_insta
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
@@ -422,6 +478,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_insta
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
@@ -451,6 +521,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_inst
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index 545826650c..e73e8aac1e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -428,20 +428,39 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
         {
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
-                         is_same_v<BComputeType, float>)
+                         is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
-                    op_ptrs);
+                if constexpr(is_same_v<AComputeType, BComputeType> && is_same_v<BComputeType, TF32>)
+                {
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
+                        op_ptrs);
+                }
             }
 #endif
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp.hpp
index 43411b0031..e41e1b833b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp.hpp
@@ -127,24 +127,44 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
-                         is_same_v<BComputeType, float>)
+                         is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
-                    op_ptrs);
+                static_assert(is_same_v<AComputeType, BComputeType>,
+                              "Error: AComputeType and BComputeType should be the same");
+                if constexpr(is_same_v<AComputeType, TF32>)
+                {
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
+                        op_ptrs);
+                }
             }
+
 #endif
         }
         // layout NDHWGC/GKZYXC/NDHWGK
@@ -197,24 +217,44 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
-                         is_same_v<BComputeType, float>)
+                         is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
-                    op_ptrs);
+                static_assert(is_same_v<AComputeType, BComputeType>,
+                              "Error: AComputeType and BComputeType should be the same");
+                if constexpr(is_same_v<AComputeType, TF32>)
+                {
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
+                        op_ptrs);
+                }
             }
+
 #endif
         }
 #endif // CK_USE_XDL
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp_xdl.inc
index aaaacb0d18..4678ab6c66 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp_xdl.inc
@@ -480,6 +480,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -508,6 +524,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -522,6 +554,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwg
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -536,6 +584,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_ins
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -550,6 +614,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intr
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -564,6 +644,22 @@ void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inte
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -578,6 +674,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -606,6 +718,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndh
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -620,6 +748,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_nd
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -634,6 +778,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -648,6 +808,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_i
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -662,6 +838,22 @@ void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_i
                                                                 PassThrough,
                                                                 AddClamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 #endif
 
 } // namespace instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp
index c8375da6e1..08bea2ce45 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp
@@ -68,6 +68,22 @@ void add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instanc
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 Bilinear>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Bilinear,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -137,8 +153,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                          is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeType, TF32>)
+                {
+                    add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                }
             }
 #endif
 #ifdef CK_ENABLE_FP16
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp.hpp
index 28e74e61e4..f2c62564c3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp.hpp
@@ -125,23 +125,44 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
-                         is_same_v<BComputeType, float>)
+                         is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
-                    op_ptrs);
+                static_assert(is_same_v<AComputeType, BComputeType>,
+                              "Error: AComputeType and BComputeType should be the same");
+                if constexpr(is_same_v<AComputeType, TF32>)
+                {
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
+                        op_ptrs);
+                }
             }
+
 #endif
         }
         // layout NDHWGC/GKZYXC/NDHWGK
@@ -193,22 +214,42 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
-                         is_same_v<BComputeType, float>)
+                         is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
-                    op_ptrs);
+                static_assert(is_same_v<AComputeType, BComputeType>,
+                              "Error: AComputeType and BComputeType should be the same");
+                if constexpr(is_same_v<AComputeType, TF32>)
+                {
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
+                        op_ptrs);
+                }
             }
 #endif
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp_xdl.inc
index d5a8a5344a..c0c3007651 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp_xdl.inc
@@ -480,6 +480,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -508,6 +524,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -522,6 +554,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -536,6 +584,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -550,6 +614,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_ins
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NHWGC,
@@ -564,6 +644,22 @@ void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_ins
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -578,6 +674,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -606,6 +718,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -620,6 +748,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -634,6 +778,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_insta
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -648,6 +808,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -662,6 +838,22 @@ void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_
                                                                 PassThrough,
                                                                 Clamp>>>& instances);
 
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 #endif
 
 } // namespace instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
index b830bdce71..91221c2c0c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
@@ -111,6 +111,21 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -281,6 +296,22 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 // grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
index 5efee69b2f..1c873863b3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
@@ -132,6 +132,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 DynamicUnaryOp>>>& instances);
+
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -159,7 +160,8 @@ template <ck::index_t NumDimSpatial,
           typename WeiDataType,
           typename DDataTypes,
           typename OutDataType,
-          typename ComputeType>
+          typename AComputeType,
+          typename BComputeType = AComputeType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
     NumDimSpatial,
     InLayout,
@@ -173,7 +175,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     ck::tensor_operation::element_wise::PassThrough,
     ck::tensor_operation::element_wise::PassThrough,
     ck::tensor_operation::element_wise::DynamicUnaryOp,
-    ComputeType>>
+    AComputeType,
+    BComputeType>>
 {
     using DeviceOp =
         DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
@@ -188,7 +191,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::DynamicUnaryOp,
-                                        ComputeType>;
+                                        AComputeType,
+                                        BComputeType>;
 
     static auto GetInstances()
     {
@@ -207,7 +211,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP16
             if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t>)
             {
                 add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                     op_ptrs);
@@ -244,7 +248,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP16
             if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t>)
             {
                 add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
                     op_ptrs);
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
index 00351ceefd..ac7a773aff 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
@@ -55,6 +55,21 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -169,6 +184,22 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instan
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 // grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
index bd44116057..68cbc56b41 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
@@ -55,6 +55,21 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -169,6 +184,21 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instan
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 // grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp
index c4bc1da57e..d11c80babf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp
@@ -68,6 +68,22 @@ void add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 Scale>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Scale,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -137,7 +153,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                          is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
+                if constexpr(is_same_v<ComputeType, TF32>)
+                {
+                    add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+                        op_ptrs);
+                }
+                else
+                {
+                    add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                }
             }
 #endif
 #ifdef CK_ENABLE_FP16
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
index a3f2515099..a59fcd9d6e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -211,6 +211,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -559,6 +575,22 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
index 5f35ab5a4b..e67d71f8ab 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
@@ -55,6 +55,22 @@ void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instan
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 #endif
 
 #ifdef CK_ENABLE_INT8
@@ -120,6 +136,22 @@ void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_ins
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 } // namespace instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
index 9f54c4b633..eedbd1abd0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
@@ -84,6 +84,22 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_insta
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
+
 void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
@@ -176,6 +192,22 @@ void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_in
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances);
 #endif
 
 // grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
index 19600a90f8..9f148618ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -77,6 +77,8 @@ void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
                                                     Activation_Mul_Clamp<PassThrough>>>>&
         instances);
 #endif
+
+#ifdef CK_USE_XDL
 // Layout(A, B, C) = [Col, Row, Row]
 void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -136,6 +138,65 @@ void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
                                                     PassThrough,
                                                     Activation_Mul_Clamp<PassThrough>>>>&
         instances);
+#endif
+
+#ifdef CK_USE_WMMA
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Activation_Mul_Clamp<PassThrough>>>>&
+        instances);
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Activation_Mul_Clamp<PassThrough>>>>&
+        instances);
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Activation_Mul_Clamp<PassThrough>>>>&
+        instances);
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Activation_Mul_Clamp<PassThrough>>>>&
+        instances);
+#endif
 
 template <typename ALayout,
           typename BLayout,
@@ -184,7 +245,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 #ifdef DL_KERNELS
                     add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
 #endif
+#ifdef CK_USE_XDL
                     add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
+#endif
                 }
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
@@ -195,7 +258,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 #ifdef DL_KERNELS
                     add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
 #endif
+#ifdef CK_USE_XDL
                     add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
+#endif
                 }
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
@@ -206,7 +271,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 #ifdef DL_KERNELS
                     add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
 #endif
+#ifdef CK_USE_XDL
                     add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
+#endif
                 }
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
@@ -217,12 +284,117 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 #ifdef DL_KERNELS
                     add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
 #endif
+#ifdef CK_USE_XDL
                     add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
+#endif
                 }
             }
-
-            return op_ptrs;
         }
+
+#ifdef CK_USE_WMMA
+        using Wrapper =
+            DeviceGemmMultipleDSplitKWrapper<ALayout,
+                                             BLayout,
+                                             Empty_Tuple,
+                                             ELayout,
+                                             ADataType,
+                                             BDataType,
+                                             Empty_Tuple,
+                                             EDataType,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             Activation_Mul_Clamp<Activation>>;
+        auto new_op_ptrs =
+            DeviceOperationInstanceFactory<typename Wrapper::DeviceOp>::GetInstances();
+        for(auto& op_ptr : new_op_ptrs)
+        {
+            op_ptrs.emplace_back(std::make_unique<Wrapper>(std::move(op_ptr)));
+        }
+#endif
+
+        return op_ptrs;
+    }
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename Activation>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleDSplitK<
+    ALayout,
+    BLayout,
+    Empty_Tuple,
+    ELayout,
+    ADataType,
+    BDataType,
+    Empty_Tuple,
+    EDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    Activation_Mul_Clamp<Activation>>>
+{
+    using DeviceOp = DeviceGemmMultipleDSplitK<ALayout,
+                                               BLayout,
+                                               Empty_Tuple,
+                                               ELayout,
+                                               ADataType,
+                                               BDataType,
+                                               Empty_Tuple,
+                                               EDataType,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               Activation_Mul_Clamp<Activation>>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_USE_WMMA
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<EDataType, int8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                if constexpr(is_same_v<Activation, PassThrough>)
+                {
+                    add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances(
+                        op_ptrs);
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                if constexpr(is_same_v<Activation, PassThrough>)
+                {
+                    add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances(
+                        op_ptrs);
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                if constexpr(is_same_v<Activation, PassThrough>)
+                {
+                    add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
+                        op_ptrs);
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                if constexpr(is_same_v<Activation, PassThrough>)
+                {
+                    add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
+                        op_ptrs);
+                }
+            }
+        }
+#endif
+        return op_ptrs;
     }
 };
 
@@ -230,4 +402,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
\ No newline at end of file
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 1eaaa7e6ba..172f6681b8 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -54,7 +54,7 @@ function(add_instance_library INSTANCE_NAME)
             list(REMOVE_ITEM ARGN "${source}")
         endif()
         # Do not build XDL instances if gfx9 targets are not on the target list
-        if(NOT INST_TARGETS MATCHES "gfx9" AND source_name MATCHES "_xdl")
+        if(NOT INST_TARGETS MATCHES "gfx9" AND NOT INST_TARGETS MATCHES "gfx11" AND NOT INST_TARGETS MATCHES "gfx12" AND source_name MATCHES "_xdl")
             message(DEBUG "removing xdl instance ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
@@ -73,13 +73,13 @@ function(add_instance_library INSTANCE_NAME)
             message(DEBUG "removing mha instance ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
-        # Do not build XDL gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
+        # Do not build XDL gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94, gfx95 and gfx12
         if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-            if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source_name MATCHES "gemm_multiply_multiply" AND source_name MATCHES "_f8_")
+            if(NOT INST_TARGETS MATCHES "gfx94|gfx95|gfx12" AND source_name MATCHES "gemm_multiply_multiply" AND source_name MATCHES "_f8_")
                 message(DEBUG "removing gemm_multiply_multiply_f8 instance ${source} ")
                 list(REMOVE_ITEM ARGN "${source}")
             endif()
-            if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source_name MATCHES "gemm_xdl_universal" AND source_name MATCHES "_f8_")
+            if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND NOT INST_TARGETS MATCHES "gfx12" AND source_name MATCHES "gemm_xdl_universal" AND source_name MATCHES "_f8_")
                 message(DEBUG "removing gemm_universal_f8 instance ${source} ")
                 list(REMOVE_ITEM ARGN "${source}")
             endif()
@@ -89,11 +89,16 @@ function(add_instance_library INSTANCE_NAME)
             message(DEBUG "removing gemm_universal_f8 instance ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
-        # Do not build gemm_universal_preshuffle_f8 for any targets except gfx94
-        if(NOT (INST_TARGETS MATCHES "gfx942" OR INST_TARGETS MATCHES "gfx950") AND (source_name MATCHES "gemm_universal_preshuffle" OR source_name MATCHES "gemm_xdl_universal_preshuffle") AND (source_name MATCHES "_f8_f8_f16" OR source_name MATCHES "_f8_f8_bf16"))
+        # Do not build gemm_universal_preshuffle_f8 for any targets except gfx94, gfx95 and gfx12
+        if(NOT (INST_TARGETS MATCHES "gfx942" OR INST_TARGETS MATCHES "gfx950" OR INST_TARGETS MATCHES "gfx12") AND (source_name MATCHES "gemm_universal_preshuffle" OR source_name MATCHES "gemm_xdl_universal_preshuffle") AND (source_name MATCHES "_f8_f8_f16" OR source_name MATCHES "_f8_f8_bf16"))
             message(DEBUG "removing gemm_universal_preshuffle_f8 instance ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
+        # Only build tf32 instances for gfx942
+        if(NOT INST_TARGETS MATCHES "gfx942" AND source_name MATCHES "_tf32_")
+            message(DEBUG "removing tf32 instance ${source} ")
+            list(REMOVE_ITEM ARGN "${source}")
+        endif()
 
     endforeach()
 
@@ -106,7 +111,7 @@ function(add_instance_library INSTANCE_NAME)
 
             set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
             if(source_name MATCHES "_xdl")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx10-3-generic)
             elseif(source_name MATCHES "_wmma")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
             elseif(source_name MATCHES "mha")
@@ -117,32 +122,32 @@ function(add_instance_library INSTANCE_NAME)
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
             endif()
 
-            #only build the fp8 gemm instances for gfx90a if the build argument is set, otherwise only build for gfx942/gfx950
+            #only build the fp8 gemm instances for gfx90a if the build argument is set, otherwise only build for gfx942/gfx950 and gfx1200/gfx1201
             if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
                 if(source_name MATCHES "gemm_xdl_universal" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_multiply_multiply" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_universal_preshuffle" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_xdl_universal_preshuffle" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
             else()
                 if(source_name MATCHES "gemm_xdl_universal" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_multiply_multiply" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_universal_preshuffle" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
                 if(source_name MATCHES "gemm_xdl_universal_preshuffle" AND source_name MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx10-3-generic gfx11-generic)
                 endif()
             endif()
             if(source_name MATCHES "gemm_wmma_universal" AND source_name MATCHES "f8")
@@ -266,7 +271,7 @@ FOREACH(subdir_path ${dir_list})
             message(DEBUG "Found only dl instances, but DL_KERNELS is not set. Skipping.")
             set(add_inst 0)
         endif()
-        if(("${cmake_instance}" MATCHES "ONLY XDL_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx9"))
+        if(("${cmake_instance}" MATCHES "ONLY XDL_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx9|gfx11|gfx12"))
             message(DEBUG "Found only xdl instances, but gfx9 is not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
@@ -278,33 +283,36 @@ FOREACH(subdir_path ${dir_list})
             message(DEBUG "Found only wmma instances, but gfx11 is not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
-        if(("${cmake_instance}" MATCHES "ONLY XDL_AND_DL_KERNELS") AND (NOT DEFINED DL_KERNELS) AND (NOT INST_TARGETS MATCHES "gfx9"))
+        if(("${cmake_instance}" MATCHES "ONLY XDL_AND_DL_KERNELS") AND (NOT DEFINED DL_KERNELS) AND (NOT INST_TARGETS MATCHES "gfx9|gfx11|gfx12"))
             message(DEBUG "Found only xdl and dl instances, but gfx9 is not on the targets listand DL_KERNELS is not set. Skipping.")
             set(add_inst 0)
         endif()
-	    if(("${cmake_instance}" MATCHES "ONLY XDL_AND_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9"))
+	    if(("${cmake_instance}" MATCHES "ONLY XDL_AND_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx9|gfx11|gfx12"))
             message(DEBUG "Found only xdl and wmma instances, but gfx11 and gfx9 are not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
-	    if(("${cmake_instance}" MATCHES "XDL_DL_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9") AND (NOT DEFINED DL_KERNELS))
+	    if(("${cmake_instance}" MATCHES "XDL_DL_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx9|gfx11|gfx12") AND (NOT DEFINED DL_KERNELS))
             message(DEBUG "Found xdl, dl, and wmma instances, but none of those meet the target list. Skipping.")
             set(add_inst 0)
         endif()
-        if(("${cmake_instance}" MATCHES "gemm_multiply_multiply" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94") AND (NOT INST_TARGETS MATCHES "gfx95") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
-            message(DEBUG "Found gemm_multiply_multiply_f8 instances, but gfx94/gfx95 not on the target list. Skipping.")
+        if("${cmake_instance}" MATCHES "gemm_multiply_multiply_wp" AND (NOT INST_TARGETS MATCHES "gfx94|gfx95|gfx12"))
+            message(DEBUG "Found gemm_multiply_multiply_wp instances, but gfx94/gfx95/gfx12 not on the target list. Skipping. ${cmake_instance}")
+            set(add_inst 0)
+        elseif("${cmake_instance}" MATCHES "gemm_multiply_multiply" AND (NOT INST_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12"))
+            message(DEBUG "Found gemm_multiply_multiply instances, but gfx94/gfx95/gfx11/gfx12 not on the target list. Skipping. ${cmake_instance}")
             set(add_inst 0)
         endif()
-        if(("${cmake_instance}" MATCHES "gemm_universal_preshuffle" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94") AND (NOT INST_TARGETS MATCHES "gfx95") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
+        if(("${cmake_instance}" MATCHES "gemm_universal_preshuffle" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94|gfx95|gfx12") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
             message(DEBUG "Found gemm_universal_preshuffle_f8 instances, but gfx94/gfx95 not on the target list. Skipping.")
             set(add_inst 0)
         endif()
-        if(("${cmake_instance}" MATCHES "gemm_xdl_universal_preshuffle" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94") AND (NOT INST_TARGETS MATCHES "gfx95") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
+        if(("${cmake_instance}" MATCHES "gemm_xdl_universal_preshuffle" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94|gfx95|gfx12") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
             message(DEBUG "Found gemm_xdl_universal_preshuffle_f8_f8_bf16 instances, but gfx94/gfx95 not on the target list. Skipping.")
             set(add_inst 0)
         endif()
         if ("${cmake_instance}" MATCHES "gemm_bilinear")
             set(add_inst 0)
-            if((SUPPORTED_GPU_TARGETS MATCHES "gfx9") AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES))
+            if((SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES))
                 set(add_inst 1)
             endif()
             if((SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") AND (DTYPES MATCHES "int8" OR NOT DEFINED DTYPES))
@@ -438,7 +446,7 @@ if(CK_DEVICE_MHA_INSTANCES AND NOT MIOPEN_REQ_LIBS_ONLY AND BUILD_MHA_LIB)
             add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
             target_compile_features(device_mha_operations PUBLIC)
             set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
-            
+
             rocm_install(TARGETS device_mha_operations
                 EXPORT device_mha_operationsTargets)
             rocm_install(EXPORT device_mha_operationsTargets
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
index 2aa607429d..527c40fcd9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
@@ -1,5 +1,9 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_batched_gemm_gemm_instance
+    device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+    device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance.cpp
+    device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+    device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
     device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
     device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
new file mode 100644
index 0000000000..d6bea4d36c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+static constexpr auto GemmPadded  = GemmSpecialization::MNKOPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto PipeVerV1 = BlockGemmPipelineVersion::v1;
+static constexpr auto PipeVerV3 = BlockGemmPipelineVersion::v3;
+
+// gemm0: Acc[g, m, n] = A[g, m, k] * B0[g, k, n]
+// gemm1: C[g, m, o] = Acc[g, m, n] * B1[g, n, o]
+// Note that in some cases the "m, o, n" dimensions are referred to as the "gemm1 m, n, k"
+// dimensions instead!
+template <GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler PipeSched,
+          BlockGemmPipelineVersion PipeVer>
+using device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
+    std::
+        tuple<
+            // clang-format off
+        //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|   BlkGemm|     BlkGemm|
+        //################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| PipeSched| PipelineVer|
+        //################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 2, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               8,               1,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 2, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               8,               1,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 4, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               1,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 4, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               1,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 8, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               2,               1,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 8, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               2,               1,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               1,               1,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               1,               1,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>
+            // clang-format on
+            >;
+
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    // clang-format off
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmDefault, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmDefault, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmDefault, Intrawave, PipeVerV3>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmPadded, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmPadded, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances<GemmPadded, Intrawave, PipeVerV3>{});
+    // clang-format on
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance.cpp
new file mode 100644
index 0000000000..3a38ab0d68
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance.cpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+static constexpr auto GemmPadded  = GemmSpecialization::MNKOPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto PipeVerV1 = BlockGemmPipelineVersion::v1;
+static constexpr auto PipeVerV3 = BlockGemmPipelineVersion::v3;
+
+// gemm0: Acc[g, m, n] = A[g, m, k] * B0[g, k, n]
+// gemm1: C[g, m, o] = Acc[g, m, n] * B1[g, n, o]
+// Note that in some cases the "m, o, n" dimensions are referred to as the "gemm1 m, n, k"
+// dimensions instead!
+template <GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler PipeSched,
+          BlockGemmPipelineVersion PipeVer>
+using device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances =
+    std::
+        tuple<
+            // clang-format off
+        //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|   BlkGemm|     BlkGemm|
+        //################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| PipeSched| PipelineVer|
+        //################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,  BF16,   BF16,   BF16,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>
+            // clang-format on
+            >;
+
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    // clang-format off
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmDefault, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmDefault, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmDefault, Intrawave, PipeVerV3>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmPadded, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmPadded, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_bf16_bf16_bf16_bf16_gmk_gnk_gon_gmo_instances<GemmPadded, Intrawave, PipeVerV3>{});
+    // clang-format on
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
new file mode 100644
index 0000000000..9155b14da8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+static constexpr auto GemmPadded  = GemmSpecialization::MNKOPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto PipeVerV1 = BlockGemmPipelineVersion::v1;
+static constexpr auto PipeVerV3 = BlockGemmPipelineVersion::v3;
+
+// gemm0: Acc[g, m, n] = A[g, m, k] * B0[g, k, n]
+// gemm1: C[g, m, o] = Acc[g, m, n] * B1[g, n, o]
+// Note that in some cases the "m, o, n" dimensions are referred to as the "gemm1 m, n, k"
+// dimensions instead!
+template <GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler PipeSched,
+          BlockGemmPipelineVersion PipeVer>
+using device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
+    std::
+        tuple<
+            // clang-format off
+        //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|   BlkGemm|     BlkGemm|
+        //################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| PipeSched| PipelineVer|
+        //################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 2, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               8,               1,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,    32,     16,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 2, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               8,               1,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,    64,     32,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 4, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               1,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,    64,     32,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 4, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               1,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,   128,     64,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 8, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               2,               1,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,   128,     64,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,       S<2, 8, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               2,               1,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               1,               1,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 8>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               1,               1,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>
+            // clang-format on
+            >;
+
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Row,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    // clang-format off
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmDefault, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmDefault, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmDefault, Intrawave, PipeVerV3>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmPadded, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmPadded, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances<GemmPadded, Intrawave, PipeVerV3>{});
+    // clang-format on
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
new file mode 100644
index 0000000000..bd7bcd1be6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+static constexpr auto GemmPadded  = GemmSpecialization::MNKOPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto PipeVerV1 = BlockGemmPipelineVersion::v1;
+static constexpr auto PipeVerV3 = BlockGemmPipelineVersion::v3;
+
+// gemm0: Acc[g, m, n] = A[g, m, k] * B0[g, k, n]
+// gemm1: C[g, m, o] = Acc[g, m, n] * B1[g, n, o]
+// Note that in some cases the "m, o, n" dimensions are referred to as the "gemm1 m, n, k"
+// dimensions instead!
+template <GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler PipeSched,
+          BlockGemmPipelineVersion PipeVer>
+using device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances =
+    std::
+        tuple<
+            // clang-format off
+        //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|   BlkGemm|     BlkGemm|
+        //################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| PipeSched| PipelineVer|
+        //################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |          |            |
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,     16,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 16, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,     32,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<4, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 32, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,     64,    64,    64,    64,    64,   8,   8,    8,   16,   16,     1,     4,     4,     S<2, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,               S<1, 64, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>,
+        DeviceBatchedGemmGemm_Wmma_CShuffleV3<  Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    128,   128,    64,    64,    64,   8,   8,    8,   16,   16,     1,     8,     4,    S<2, 128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,           1,           1,              S<1, 128, 1, 2>,               8, PipeSched,    PipeVer>
+            // clang-format on
+            >;
+
+void add_device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    // clang-format off
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmDefault, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmDefault, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmDefault, Intrawave, PipeVerV3>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmPadded, Intrawave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmPadded, Interwave, PipeVerV1>{});
+    add_device_operation_instances(instances, device_batched_gemm_gemm_wmma_cshuffle_v3_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances<GemmPadded, Intrawave, PipeVerV3>{});
+    // clang-format on
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add/CMakeLists.txt
index 298da1fbef..478e9a8ab8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add/CMakeLists.txt
@@ -1,5 +1,8 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_instance
    device_gemm_add_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instance.cpp
    device_gemm_add_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
+
+   device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+   device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..b3f862f9cd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+
+// e = elementwise((a * b), d0, d1)
+// outout: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..ec8fe54888
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add/device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+
+// e = elementwise((a * b), d0, d1)
+// outout: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, Add,         GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt
index 04ae90bc5b..ab8023d1ba 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt
@@ -1,5 +1,10 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_add_fastgelu_instance
+   device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+   device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+   device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+   device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+
    device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
    device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
    device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..8c8006cd3d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0, d1)
+// elementwise(c, d0, d1) = fastgelu(c + d0 + d1)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n], d1[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|            CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|    Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|      Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |               |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..e2a99fea9e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0, d1)
+// elementwise(c, d0, d1) = fastgelu(c + d0 + d1)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n], d1[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|     DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|            CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |             |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|    Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |             |        |      |      |              |      |        |         |   Operation|   Operation|      Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |             |        |      |      |              |      |        |         |            |            |               |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..10dfce38a1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0, d1)
+// elementwise(c, d0, d1) = fastgelu(c + d0 + d1)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n], d1[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|         DsData| EData| AccData| CShuffle|           A|           B|            CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|           Type|  Type|    Type| DataType| Elementwise| Elementwise|    Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |               |      |        |         |   Operation|   Operation|      Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |               |      |        |         |            |            |               |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,    F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..5307b44389
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0, d1)
+// elementwise(c, d0, d1) = fastgelu(c + d0 + d1)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n], d1[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|     DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|            CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |             |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|    Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |             |        |      |      |              |      |        |         |   Operation|   Operation|      Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |             |        |      |      |              |      |        |         |            |            |               |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddAddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddAddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
index 45d6abce01..46f0c3b9c6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
@@ -1,9 +1,14 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_fastgelu_instance
-   device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
-   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
-   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
-   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
-   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
-   device_gemm_add_fastgelu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+
+    device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..cfae2c4508
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
new file mode 100644
index 0000000000..00e06c3441
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..1bc634de38
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
new file mode 100644
index 0000000000..4a8643d553
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt
index d859078ca9..2e6bdca234 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt
@@ -1,7 +1,11 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_multiply_instance
    device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
    device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
    device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
    device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+   device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+   device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+   device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+   device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..6c6f354d6f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..a56efff220
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|     DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |             |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |             |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |             |        |      |      |              |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..a92f843de4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..6b092fb2a6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|     DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |             |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |             |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |             |        |      |      |              |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, AddMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_multiply_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_relu/CMakeLists.txt
index 043bdab001..1bdf611907 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu/CMakeLists.txt
@@ -1,5 +1,8 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_relu_instance
    device_gemm_add_relu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instance.cpp
    device_gemm_add_relu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
+   device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
+   device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
 )
+
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..35c373a0e7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+
+// e = elementwise((a * b), d0, d1)
+// outout: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   BF16,    BF16, BF16_Tuple,   BF16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddRelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_relu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..794b7f0e3e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu/device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+template <GemmSpecialization GemmSpec>
+
+// e = elementwise((a * b), d0, d1)
+// outout: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddRelu,         GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddRelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_relu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_silu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_silu/CMakeLists.txt
index e6ca64cdc1..565096dd61 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_silu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_silu/CMakeLists.txt
@@ -1,5 +1,6 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_silu_instance
+   device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
    device_gemm_add_silu_xdl_c_shuffle_f16_i8_f16_f16_mk_kn_mn_mn_instance.cpp
    device_gemm_add_silu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_silu/device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_silu/device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..491f25dec8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_silu/device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0, d1)
+// outout: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+
+template <GemmSpecialization GemmSpec, typename T_dataType, typename T_tupleDataType>
+using device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|    DsLayout| ELayout|       AData|        BData|              DsData|        EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |            |        |        Type|         Type|                Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |            |        |            |             |                    |             |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |            |        |            |             |                    |             |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row,  Row_Tuple,      Row,   T_dataType,   T_dataType,     T_tupleDataType,   T_dataType,     F32,      F32, PassThrough, PassThrough,     AddSilu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddSilu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault,
+                                                                                  F16,
+                                                                                  F16_Tuple>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmMNKPadding,
+                                                                                  F16,
+                                                                                  F16_Tuple>{});
+}
+
+// implement bf16 with same parameters to avoid duplication
+void add_device_gemm_add_silu_wmma_c_shuffle_bf16_bf16_bf16_bf16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          BF16_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddSilu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault,
+                                                                                  BF16,
+                                                                                  BF16_Tuple>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_silu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmMNKPadding,
+                                                                                  BF16,
+                                                                                  BF16_Tuple>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
index 61892e708c..39e83495d4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
@@ -4,6 +4,10 @@ add_instance_library(device_gemm_bilinear_instance
    device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
    device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
    device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
    device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp
    device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp
    device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..142c89c80b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e[m, n] = bilinear(a[k, m] * b[k, n], d[m, n])
+template <GemmSpecialization GemmSpec>
+using device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
new file mode 100644
index 0000000000..cbf0fe6563
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e[m, n] = bilinear(a[k, m] * b[k, n], d[m, n])
+template <GemmSpecialization GemmSpec>
+using device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..c8a7a66a93
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e[m, n] = bilinear(a[k, m] * b[k, n], d[m, n])
+template <GemmSpecialization GemmSpec>
+using device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
new file mode 100644
index 0000000000..57ea32b083
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e[m, n] = bilinear(a[k, m] * b[k, n], d[m, n])
+template <GemmSpecialization GemmSpec>
+using device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough,    Bilinear, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_bilinear_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp
index 6a23b70321..a948a59c00 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp
@@ -45,7 +45,7 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+
         // M/N/K padding
         // N % 16 == 0 && K % 16 == 0
         //################################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|   DsData| EData|            A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
@@ -55,7 +55,7 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+
         // M/N/K padding
         // N % 8 == 0 && K % 8 == 0
         //################################|      A|      B|        Ds|      E| AData| BData|  AccData| CShuffle|   DsData| EData|           A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
@@ -65,7 +65,6 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 4>,               8>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,   8,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
         
         // M/N/K padding
         // N % 8 == 0 && K % 8 == 0
@@ -76,7 +75,6 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    32,   4,   16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           2,               S<1, 32, 1, 8>,               4>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    32,   4,   16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           2,               S<1, 32, 1, 4>,               4>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    32,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           2,               S<1, 32, 1, 2>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32,  I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    32,   4,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 16, 1, 2>,               4>,
         
         // M/N/K padding
         // N % 1 == 0 && K % 8 == 0
@@ -86,8 +84,7 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st
         //################################|       |       |          |       |      |      |        |         |         |      |             |            |            |               |         |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               1>,
         DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 4>,               1>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 2>,               1>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,   8,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               1>
+        DeviceGemmMultipleD_Wmma_CShuffle<     Row,    Col, Row_Tuple,    Row,    I8,    I8,     I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 2>,               1>
 
     // clang-format on
     >;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt
index 2f45401ec6..f3273fb8ed 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt
@@ -1,5 +1,10 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_fastgelu_instance
+   device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+   device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+   device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+
    device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
    device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
    device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
new file mode 100644
index 0000000000..6bb4f4a0e0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise(a * b)
+// elementwise(c) = fastgelu(c)
+// output: e[m, n]
+// input: a[m, k], b[n, k]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|    DsLayout| ELayout| AData| BData|      DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |            |        |  Type|  Type|        Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |            |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |            |        |      |      |            |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_kn_mn_instance<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
new file mode 100644
index 0000000000..5e0a9da5a8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise(a * b)
+// elementwise(c) = fastgelu(c)
+// output: e[m, n]
+// input: a[m, k], b[n, k]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|    DsLayout| ELayout| AData| BData|       DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |            |        |  Type|  Type|         Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |            |        |      |      |             |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |            |        |      |      |             |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Col, Empty_Tuple,     Row,   F16,    F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_km_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
new file mode 100644
index 0000000000..cba209b408
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise(a * b)
+// elementwise(c) = fastgelu(c)
+// output: e[m, n]
+// input: a[m, k], b[n, k]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|   DsLayout| ELayout| AData| BData|      DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |           |        |  Type|  Type|        Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |           |        |      |      |            |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |           |        |      |      |            |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,     FastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_kn_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
new file mode 100644
index 0000000000..27d34bfe63
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise(a * b)
+// elementwise(c) = fastgelu(c)
+// output: e[m, n]
+// input: a[m, k], b[n, k]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|    DsLayout| ELayout| AData| BData|      DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|    BlkGemm|     BlkGemm|
+        //##################################|        |        |            |        |  Type|  Type|        Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|  PipeSched| PipelineVer|
+        //##################################|        |        |            |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |           |            |
+        //##################################|        |        |            |        |      |      |            |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |           |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Row,     Col, Empty_Tuple,     Row,   F16,   F16, Empty_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,    FastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+    // clang-format on
+    >;
+
+void add_device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          Empty_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          FastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_fastgelu_wmma_c_shuffle_f16_f16_f16_mk_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt
index 5af7322b1a..5ce585ad81 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt
@@ -1,16 +1,26 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(GEMM_MULTI_ABD_INSTANCES)
 
 list(APPEND GEMM_MULTI_ABD_INSTANCES 
-	device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	device_gemm_xdl_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
-	)
+    device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_wmma_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+
+    device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+    device_gemm_xdl_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
+)
 
 add_instance_library(device_gemm_multi_abd_instance ${GEMM_MULTI_ABD_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp
new file mode 100644
index 0000000000..8d4c45ae82
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = BF16;
+using AsDataType       = ck::Tuple<A0DataType>;
+using B0DataType       = I8;
+using B1DataType       = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = BF16;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using AsLayout = ck::Tuple<A0Layout>;
+using B0Layout = Row;
+using B1Layout = B0Layout;
+using D0Layout = Row;
+using ELayout  = Row;
+
+using Multiply            = ck::tensor_operation::element_wise::Multiply;
+using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
+using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
+using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using AElementOp = PassThrough;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNPadding  = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+template <typename BsLayout,
+          typename DsLayout,
+          typename BsDataType,
+          typename DsDataType,
+          typename BElementOp,
+          typename CDEElementOp,
+          ck::tensor_operation::device::GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler BlkGemmPipeSched>
+using device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
+    // clang-format off
+       //###################################|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle|       CBlockTransferClusterLengths|  CBlockTransfer|                       BlkGemmPipeSched|           BlkGemmPipelineVer|
+       //###################################|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat| _MBlock_MPerBlock_NBlock_NPerBlock| ScalarPerVector|                                       |                             |
+       //###################################|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerShuffle| PerShuffle|                                   |                |                                       |                             |
+       //###################################|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |           |           |                                   |                |                                       |                             |
+       DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   256,   256,    32,   8,   8,   16,   16,       8,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,          1,          1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+       DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,          1,          1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+       DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   128,   256,    32,   8,   8,   16,   16,       4,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,          1,          1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,                       BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+       DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,          1,          1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,                       BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <typename BsLayout,
+          typename DsLayout,
+          typename BsDataType,
+          typename DsDataType,
+          typename BElementOp,
+          typename CDEElementOp,
+          ck::tensor_operation::device::GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler BlkGemmPipeSched>
+using device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances = std::tuple<
+    // clang-format off
+        //###################################|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|       CBlockTransferClusterLengths|  CBlockTransfer|  BlkGemmPipeSched|           BlkGemmPipelineVer|
+        //###################################|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat| _MBlock_MPerBlock_NBlock_NPerBlock| ScalarPerVector|                  |                             |
+        //###################################|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                   |                |                  |                             |
+        //###################################|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                   |                |                  |                             |
+        DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,    S< 4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S< 4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         0,           1,           1,                     S<1, 32, 1, 2>,      S<8, 8, 8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,    32,    16,    16,   256,   8,   8,   16,   16,       1,       1,    S<32,  1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<32,  1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,           1,           1,                     S<1, 16, 1, 2>,      S<8, 8, 8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,    64,    16,    32,   256,   8,   8,   16,   16,       1,       1,    S<32,  2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<32,  2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,           1,           1,                     S<1, 16, 1, 4>,      S<8, 8, 8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..eef450533b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout, B1Layout>,
+                                                            ck::Tuple<>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType, B1DataType>,
+                                                            ck::Tuple<>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            Multiply,
+                                                            PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<>,
+                                       Multiply,
+                                       PassThrough,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<>,
+                                       Multiply,
+                                       PassThrough,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp
new file mode 100644
index 0000000000..0c2a34fbf8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = BF16;
+using AsDataType       = ck::Tuple<A0DataType>;
+using B0DataType       = I8;
+using B1DataType       = BF16;
+using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using D0DataType       = BF16;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using AsLayout = ck::Tuple<A0Layout>;
+using B0Layout = Col;
+using B1Layout = B0Layout;
+using BsLayout = ck::Tuple<B0Layout, B1Layout>;
+using D0Layout = Row;
+using ELayout  = Row;
+
+using Multiply    = ck::tensor_operation::element_wise::Multiply;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using AElementOp = PassThrough;
+using BElementOp = Multiply;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+// using CDEElementOp = AddFastGelu;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNPadding  = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+template <typename DsLayout,
+          typename DsDataType,
+          typename CDEElementOp,
+          ck::tensor_operation::device::GemmSpecialization GemmSpec,
+          BlockGemmPipelineScheduler BlkGemmPipeSched>
+using device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //###################################|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|       CBlockTransferClusterLengths|  CBlockTransfer|                       BlkGemmPipeSched|           BlkGemmPipelineVer|
+        //###################################|         |         |         |        |       Type|       Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat| _MBlock_MPerBlock_NBlock_NPerBlock| ScalarPerVector|                                       |                             |
+        //###################################|         |         |         |        |           |           |            |                 |           |          |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                   |                |                                       |                             |
+        //###################################|         |         |         |        |           |           |            |                 |           |          |            |            |             |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                   |                |                                       |                             |
+        DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemmMultipleABD_Wmma_CShuffleV3< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           1,                     S<1, 32, 1, 8>,      S<8, 8, 8>,                       BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..30ab4135d9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout, B1Layout>,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType, B1DataType>,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            Multiply,
+                                                            Add>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<D0Layout>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<D0DataType>,
+                                       Multiply,
+                                       Add,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<D0Layout>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<D0DataType>,
+                                       Multiply,
+                                       Add,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..56d30f9ad2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout, B1Layout>,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType, B1DataType>,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            Multiply,
+                                                            AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<D0Layout>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<D0DataType>,
+                                       Multiply,
+                                       AddFastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<D0Layout>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<D0DataType>,
+                                       Multiply,
+                                       AddFastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
new file mode 100644
index 0000000000..d4b9054a73
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            BsLayout,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            BsDataType,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            BElementOp,
+                                                            AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_comp_instances<ck::Tuple<D0Layout>,
+                                                                        ck::Tuple<D0DataType>,
+                                                                        AddFastGelu,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+}
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            BsLayout,
+                                                            ck::Tuple<D0Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            BsDataType,
+                                                            ck::Tuple<D0DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            BElementOp,
+                                                            Add>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_comp_instances<ck::Tuple<D0Layout>,
+                                                                        ck::Tuple<D0DataType>,
+                                                                        Add,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+}
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            BsLayout,
+                                                            ck::Tuple<>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            BsDataType,
+                                                            ck::Tuple<>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            BElementOp,
+                                                            PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_comp_instances<ck::Tuple<>,
+                                                                        ck::Tuple<>,
+                                                                        PassThrough,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+}
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            BsLayout,
+                                                            ck::Tuple<>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            BsDataType,
+                                                            ck::Tuple<>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            BElementOp,
+                                                            FastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_nk_mn_comp_instances<ck::Tuple<>,
+                                                                        ck::Tuple<>,
+                                                                        FastGelu,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..cfeaad1a66
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout, B1Layout>,
+                                                            ck::Tuple<>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType, B1DataType>,
+                                                            ck::Tuple<>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            Multiply,
+                                                            FastGelu>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<>,
+                                       Multiply,
+                                       FastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout, B1Layout>,
+                                       ck::Tuple<>,
+                                       ck::Tuple<B0DataType, B1DataType>,
+                                       ck::Tuple<>,
+                                       Multiply,
+                                       FastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..fe36c30e75
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout>,
+                                                            ck::Tuple<B1Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType>,
+                                                            ck::Tuple<B1DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            PassThrough,
+                                                            Multiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<B0Layout>,
+                                                                        ck::Tuple<B1Layout>,
+                                                                        ck::Tuple<B0DataType>,
+                                                                        ck::Tuple<B1DataType>,
+                                                                        PassThrough,
+                                                                        Multiply,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<B0Layout>,
+                                                                       ck::Tuple<B1Layout>,
+                                                                       ck::Tuple<B0DataType>,
+                                                                       ck::Tuple<B1DataType>,
+                                                                       PassThrough,
+                                                                       Multiply,
+                                                                       GemmMNKPadding,
+                                                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..69b0e6ff0b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout>,
+                                                            ck::Tuple<D0Layout, B1Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType>,
+                                                            ck::Tuple<D0DataType, B1DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            PassThrough,
+                                                            MultiplyAdd>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout>,
+                                       ck::Tuple<D0Layout, B1Layout>,
+                                       ck::Tuple<B0DataType>,
+                                       ck::Tuple<D0DataType, B1DataType>,
+                                       PassThrough,
+                                       MultiplyAdd,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout>,
+                                       ck::Tuple<D0Layout, B1Layout>,
+                                       ck::Tuple<B0DataType>,
+                                       ck::Tuple<D0DataType, B1DataType>,
+                                       PassThrough,
+                                       MultiplyAdd,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..a779f27f62
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout>,
+                                                            ck::Tuple<D0Layout, B1Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType>,
+                                                            ck::Tuple<D0DataType, B1DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            PassThrough,
+                                                            MultiplyAddFastGelu>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<
+                                       ck::Tuple<B0Layout>,
+                                       ck::Tuple<D0Layout, B1Layout>,
+                                       ck::Tuple<B0DataType>,
+                                       ck::Tuple<D0DataType, B1DataType>,
+                                       PassThrough,
+                                       MultiplyAddFastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<
+                                       ck::Tuple<B0Layout>,
+                                       ck::Tuple<D0Layout, B1Layout>,
+                                       ck::Tuple<B0DataType>,
+                                       ck::Tuple<D0DataType, B1DataType>,
+                                       PassThrough,
+                                       MultiplyAddFastGelu,
+                                       GemmMNKPadding,
+                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
new file mode 100644
index 0000000000..dec51f72aa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_wmma_multi_abd_multiply_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
+
+#include "device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_multi_abd_multiply_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleABDSplitK<AsLayout,
+                                                            ck::Tuple<B0Layout>,
+                                                            ck::Tuple<B1Layout>,
+                                                            ELayout,
+                                                            AsDataType,
+                                                            ck::Tuple<B0DataType>,
+                                                            ck::Tuple<B1DataType>,
+                                                            EDataType,
+                                                            AElementOp,
+                                                            PassThrough,
+                                                            MultiplyFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<B0Layout>,
+                                                                        ck::Tuple<B1Layout>,
+                                                                        ck::Tuple<B0DataType>,
+                                                                        ck::Tuple<B1DataType>,
+                                                                        PassThrough,
+                                                                        MultiplyFastGelu,
+                                                                        GemmMNKPadding,
+                                                                        Interwave>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_multi_abd_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<B0Layout>,
+                                                                       ck::Tuple<B1Layout>,
+                                                                       ck::Tuple<B0DataType>,
+                                                                       ck::Tuple<B1DataType>,
+                                                                       PassThrough,
+                                                                       MultiplyFastGelu,
+                                                                       GemmMNKPadding,
+                                                                       Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
index aba9806a74..3a27e43dd6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
@@ -1,7 +1,11 @@
-# ONLY XDL_KERNELS
-set(GEMM_MULTIPLY_ADD_INSTANCES)
-list(APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
-                                        device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
-                                        device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
-                                        device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp)
-add_instance_library(device_gemm_multiply_add_instance ${GEMM_MULTIPLY_ADD_INSTANCES})
+# ONLY XDL_AND_WMMA_KERNELS
+add_instance_library(device_gemm_multiply_add_instance 
+    device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
+    device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..6ab8f44026
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+using device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|          CDE|      GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|              |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|    Operation|              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |             |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..30d40d7002
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+using device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE|       GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|               |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,   F16, F16_F16_Tuple,   F16,     F32,      F16, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_add_wmma_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..137e67df25
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+using device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|          CDE|      GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|              |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|    Operation|              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |             |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
new file mode 100644
index 0000000000..933af4c40d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+using device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|         CDE|       GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|               |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Row_Tuple,     Row,   F16,    F8, F32_F32_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, MultiplyAdd, GemmMNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyAdd>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_add_wmma_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
index 6336833c3a..0e52eac0bf 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
@@ -1,4 +1,4 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(GEMM_MULTIPLY_MULTIPLY_INSTANCES)
 
 list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES 
@@ -38,6 +38,11 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES
         device_gemm_multiply_multiply_xdl_i8_i8_f16/device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_multiply_multiply_xdl_i8_i8_f16/device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_multiply_multiply_xdl_i8_i8_f16/device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+
+        device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_mk_nk_mn.cpp
+        device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_mk_nk_mn.cpp
+        device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_mk_nk_mn.cpp
+        device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_mk_nk_mn.cpp
         )
 
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance_part1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_mk_nk_mn.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_mk_nk_mn.cpp
new file mode 100644
index 0000000000..bafbe66e4b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_mk_nk_mn.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_wmma_f8_f8_bf16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|              CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|    BlkGemm|     BlkGemm| Compute| Compute|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|      Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|  PipeSched| PipelineVer|   TypeA|   TypeB|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|        Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |           |            |        |        |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |                 |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |           |            |        |        |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,   BF16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>
+    // clang-format on
+    >;
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_multiply_multiply_wmma_f8_f8_bf16_mk_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_wmma_f8_f8_bf16_mk_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_mk_nk_mn.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_mk_nk_mn.cpp
new file mode 100644
index 0000000000..fc96eee74b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_mk_nk_mn.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_wmma_f8_f8_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|              CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|    BlkGemm|     BlkGemm| Compute| Compute|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|      Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|  PipeSched| PipelineVer|   TypeA|   TypeB|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|        Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |           |            |        |        |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |                 |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |           |            |        |        |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    F8,    F8, F32_F32_Tuple,    F16,    F32,       F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V3,      F8,      F8>
+    // clang-format on
+    >;
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_f8_f8_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          F32_F32_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_multiply_multiply_wmma_f8_f8_f16_mk_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_wmma_f8_f8_f16_mk_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_mk_nk_mn.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_mk_nk_mn.cpp
new file mode 100644
index 0000000000..2397c1a760
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_mk_nk_mn.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_wmma_i8_i8_bf16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|              CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|    BlkGemm|     BlkGemm| Compute| Compute|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|      Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|  PipeSched| PipelineVer|   TypeA|   TypeB|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|        Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |           |            |        |        |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |                 |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |           |            |        |        |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F32_F32_Tuple,   BF16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>
+    // clang-format on
+    >;
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          F32_F32_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_multiply_multiply_wmma_i8_i8_bf16_mk_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_wmma_i8_i8_bf16_mk_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_mk_nk_mn.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_mk_nk_mn.cpp
new file mode 100644
index 0000000000..5cc13884dd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_mk_nk_mn.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_wmma_i8_i8_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout|      DsLayout| ELayout| AData| BData|        DsData| EData| AccData| CShuffle|           A|           B|              CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|    BlkGemm|     BlkGemm| Compute| Compute|
+        //##################################|        |        |              |        |  Type|  Type|          Type|  Type|    Type| DataType| Elementwise| Elementwise|      Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|  PipeSched| PipelineVer|   TypeA|   TypeB|
+        //##################################|        |        |              |        |      |      |              |      |        |         |   Operation|   Operation|        Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |           |            |        |        |
+        //##################################|        |        |              |        |      |      |              |      |        |         |            |            |                 |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |           |            |        |        |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Interwave,          V1,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Col_Tuple,     Row,    I8,    I8, F16_F16_Tuple,    F16,    I32,       I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<1, 1, 1>,  Intrawave,          V3,      I8,      I8>
+    // clang-format on
+    >;
+
+void add_device_gemm_multiply_multiply_wmma_c_shuffle_i8_i8_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Col_Tuple,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          F16_F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_multiply_multiply_wmma_i8_i8_f16_mk_nk_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_wmma_i8_i8_f16_mk_nk_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/CMakeLists.txt
index 07263528b9..142ace2e42 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/CMakeLists.txt
@@ -1,6 +1,7 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(GEMM_UNIVERSAL_REDUCE_INSTANCES)
 
+# XDL instances
 list(APPEND GEMM_UNIVERSAL_REDUCE_INSTANCES
         device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -30,4 +31,11 @@ list(APPEND GEMM_UNIVERSAL_REDUCE_INSTANCES
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
         )
 
+# WMMA instances
+list(APPEND GEMM_UNIVERSAL_REDUCE_INSTANCES
+        device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
+        )
+
 add_instance_library(device_gemm_universal_reduce_instance ${GEMM_UNIVERSAL_REDUCE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..ee94046b8d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+using DsLayout   = ck::Tuple<>;
+using DsDataType = ck::Tuple<>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout|  DsLayout| CLayout| AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPerWmma|NPerWmma|MRepeat|NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|  CShuffle|  CShuffle|         CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|                Block-wiseGemm|      Reduce|
+        //#########################|        |        |          |        |  Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    |        |        |       |       |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|MRepeatPer|NRepeatPer|     _MBlock_MRepeatPerShuffle_MWaveM| ScalarPerVector|                               Pipeline|                      Pipeline|    DataType|
+        //#########################|        |        |          |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |        |        |       |       | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | Shuffle  | Shuffle  |  PerShuffle_NBlock_NRepeatPerShuffle|   _NPerBlock   |                              Scheduler|                       Version|            |
+        //#########################|        |        |          |        |      |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |        |        |       |       |                |               |               |               |               |               |          |                |               |               |              |               |               |          |          |          | _NWaveNPerRepeat                    |                |                                       |                              |            |
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   8,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   8,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   8,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   8,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,   128,    32,   8,   8,      16,      16,      4,      4,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   8,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   8,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   8,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   8,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>
+        // clang-format on
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..20d88e4740
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16        = bhalf_t;
+using Row         = tensor_layout::gemm::RowMajor;
+using PassThrough = element_wise::PassThrough;
+
+void add_device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               BF16,
+                                               BF16,
+                                               DsDataType,
+                                               BF16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    if(ck::is_gfx12_supported())
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_instances<GemmDefault,
+                                                                                DsLayout,
+                                                                                DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_instances<GemmKPadding,
+                                                                                DsLayout,
+                                                                                DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_instances<GemmMNPadding,
+                                                                                DsLayout,
+                                                                                DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_bf16_bf16_mk_kn_mn_instances<GemmMNKPadding,
+                                                                                DsLayout,
+                                                                                DsDataType>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..3ddeec3c02
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn.hpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8   = int8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+using DsLayout   = ck::Tuple<>;
+using DsDataType = ck::Tuple<>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout|  DsLayout| CLayout| AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPerWmma|NPerWmma|MRepeat|NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|  CShuffle|  CShuffle|         CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|      Reduce|
+        //#########################|        |        |          |        |  Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    |        |        |       |       |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|MRepeatPer|NRepeatPer|     _MBlock_MRepeatPerShuffle_MWaveM| ScalarPerVector|                               Pipeline|                     Pipeline|    DataType|
+        //#########################|        |        |          |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |        |        |       |       | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | Shuffle  | Shuffle  |  PerShuffle_NBlock_NRepeatPerShuffle|   _NPerBlock   |                              Scheduler|                      Version|            |
+        //#########################|        |        |          |        |      |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |        |        |       |       |                |               |               |               |               |               |          |                |               |               |              |               |               |          |          |          | _NWaveNPerRepeat                    |                |                                       |                             |            |
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   4,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   4,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   4,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   4,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   4,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   4,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   4,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,   128,    32,   8,   4,      16,      16,      4,      4,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   4,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   4,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   4,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   4,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   4,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   4,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,  BF16,    I8, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   4,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>
+        // clang-format on
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..52589a258f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_bf16_i8_bf16/device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8          = int8_t;
+using BF16        = bhalf_t;
+using Row         = tensor_layout::gemm::RowMajor;
+using PassThrough = element_wise::PassThrough;
+
+void add_device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout,
+                                               Row,
+                                               BF16,
+                                               I8,
+                                               DsDataType,
+                                               BF16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    if(ck::is_gfx12_supported())
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_instances<GemmDefault,
+                                                                              DsLayout,
+                                                                              DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_instances<GemmKPadding,
+                                                                              DsLayout,
+                                                                              DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_instances<GemmMNPadding,
+                                                                              DsLayout,
+                                                                              DsDataType>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_bf16_i8_bf16_mk_kn_mn_instances<GemmMNKPadding,
+                                                                              DsLayout,
+                                                                              DsDataType>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..564b81496d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+using DsLayout   = ck::Tuple<>;
+using DsDataType = ck::Tuple<>;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout|  DsLayout| CLayout| AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPerWmma|NPerWmma|MRepeat|NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|  CShuffle|  CShuffle|         CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|                Block-wiseGemm|      Reduce|
+        //#########################|        |        |          |        |  Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    |        |        |       |       |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|MRepeatPer|NRepeatPer|     _MBlock_MRepeatPerShuffle_MWaveM| ScalarPerVector|                               Pipeline|                      Pipeline|    DataType|
+        //#########################|        |        |          |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |        |        |       |       | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | Shuffle  | Shuffle  |  PerShuffle_NBlock_NRepeatPerShuffle|   _NPerBlock   |                              Scheduler|                       Version|            |
+        //#########################|        |        |          |        |      |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |        |        |       |       |                |               |               |               |               |               |          |                |               |               |              |               |               |          |          |          | _NWaveNPerRepeat                    |                |                                       |                              |            |
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   8,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   8,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   8,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   8,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,   128,    32,   8,   8,      16,      16,      4,      4,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave,  BlockGemmPipelineVersion::v1,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    32,   8,   8,      16,      16,      4,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   8,      16,      16,      4,      4,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,    32,   8,   8,      16,      16,      8,      2,     S<4, 64, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   8,      16,      16,      4,      2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    64,   8,   8,      16,      16,      4,      4,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    64,    64,    32,   8,   8,      16,      16,      2,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,     true,          1,         1,                       S<1, 32, 1, 2>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>,
+        DeviceGemm_Wmma_CShuffleV3R1<    Row,     Row,  DsLayout,     Row,   F16,   F16,  DsDataType,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,    64,    64,   8,   8,      16,      16,      4,      2,     S<4, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              8,       true,    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,     true,          1,         1,                       S<1, 32, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave,  BlockGemmPipelineVersion::v3,      float>
+        // clang-format on
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..3663ee6529
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16         = half_t;
+using Row         = tensor_layout::gemm::RowMajor;
+using PassThrough = element_wise::PassThrough;
+using Add         = element_wise::Add;
+
+using DsLayout_F16   = ck::Tuple<>;
+using DsDataType_F16 = ck::Tuple<>;
+
+void add_device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2R1<Row,
+                                               Row,
+                                               DsLayout_F16,
+                                               Row,
+                                               F16,
+                                               F16,
+                                               DsDataType_F16,
+                                               F16,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough>>>& instances)
+{
+    if(ck::is_gfx12_supported())
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_instances<GemmDefault,
+                                                                             DsLayout_F16,
+                                                                             DsDataType_F16>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_instances<GemmKPadding,
+                                                                             DsLayout_F16,
+                                                                             DsDataType_F16>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_instances<GemmMNPadding>{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_wmma_universal_reduce_f16_f16_f16_mk_kn_mn_instances<GemmMNKPadding>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
index 0ef09c55ee..2598325d62 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
@@ -10,6 +10,9 @@ add_instance_library(
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_optimized_loads_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_optimized_loads_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_optimized_loads_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..ff4ce04949
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_optimized_loads_instances<2,
+                                                                        NHWGK,
+                                                                        GKYXC,
+                                                                        Empty_Tuple,
+                                                                        NHWGC,
+                                                                        ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_bf16_optimized_loads_instances<
+                                       2,
+                                       NHWGK,
+                                       GKYXC,
+                                       Empty_Tuple,
+                                       NHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..69f70c81a9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances<2,
+                                                                       NHWGK,
+                                                                       GKYXC,
+                                                                       Empty_Tuple,
+                                                                       NHWGC,
+                                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances<
+                                       2,
+                                       NHWGK,
+                                       GKYXC,
+                                       Empty_Tuple,
+                                       NHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..7d2c2454e1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_optimized_loads_instances<2,
+                                                                       NHWGK,
+                                                                       GKYXC,
+                                                                       Empty_Tuple,
+                                                                       NHWGC,
+                                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_f32_optimized_loads_instances<
+                                       2,
+                                       NHWGK,
+                                       GKYXC,
+                                       Empty_Tuple,
+                                       NHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 7f3621a2ba..5987b90685 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -9,6 +9,7 @@ set(GROUPED_CONV2D_FWD
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp
@@ -28,12 +29,14 @@ set(GROUPED_CONV2D_FWD
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # merged groups
    # NHWGC, GKYXC, NHWGK
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKCYX, NGKHW
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -44,9 +47,11 @@ set(GROUPED_CONV2D_FWD
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.cpp
    # NHWGC, GKYXC, NHWGK
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
    # NGCHW, GKCYX, NGKHW
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
@@ -61,6 +66,7 @@ set(GROUPED_CONV2D_FWD
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..352aa82d9f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.cpp
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Empty_Tuple,
+                                                            NHWGK,
+                                                            ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Empty_Tuple,
+                                                            NHWGK,
+                                                            ConvFwd1x1P0>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Empty_Tuple,
+                                                            NHWGK,
+                                                            ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Empty_Tuple,
+                                                            NHWGK,
+                                                            ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..8143553d54
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                       NHWGC,
+                                                       GKYXC,
+                                                       Empty_Tuple,
+                                                       NHWGK,
+                                                       ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                       NHWGC,
+                                                       GKYXC,
+                                                       Empty_Tuple,
+                                                       NHWGK,
+                                                       ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..9a81ccbb82
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    Empty_Tuple,
+                                                                    NHWGK,
+                                                                    ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..676e2d4a27
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwdOddC,
+                                                                                      Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..5601638e77
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwdOddC,
+                                                                                      Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..5f3f2a2247
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Empty_Tuple,
+                                                                     NHWGK,
+                                                                     ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Empty_Tuple,
+                                                                     NHWGK,
+                                                                     ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt
index c06e4f5953..a801144bfd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt
@@ -85,6 +85,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
+  NUM_SHARDS 2
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+
    # merged groups
    # NHWGC, GKYXC, NHWGK
    
@@ -114,6 +124,15 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
    #mem
    # NHWGC, GKYXC, NHWGK
    
@@ -143,6 +162,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
    # NHWGC, GKYXC, NHWGK
    
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
@@ -171,6 +200,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
    #comp
    # NHWGC, GKYXC, NHWGK
    
@@ -200,7 +239,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.in
new file mode 100644
index 0000000000..d12ae33a8e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instance.in
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances =
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp,
+                                                        TF32,
+                                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances_shard(
+    device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwdDefault,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
new file mode 100644
index 0000000000..6073ad94d3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances =
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp,
+                                                        TF32,
+                                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances_shard(
+    device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<
+                                           2,
+                                           NHWGC,
+                                           GKYXC,
+                                           Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                           NHWGK,
+                                           ConvFwdDefault,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.in
new file mode 100644
index 0000000000..f516770698
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instance.in
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances =
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp,
+                                                        TF32,
+                                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances_shard(
+    device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwdDefault,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1P0,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.in
new file mode 100644
index 0000000000..75aabfaa94
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instance.in
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances =
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp,
+                                                        TF32,
+                                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances_shard(
+    device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwdDefault,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1P0,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               2,
+                                               NHWGC,
+                                               GKYXC,
+                                               Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                               NHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
new file mode 100644
index 0000000000..3d147035db
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instance.in
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances =
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp,
+                                                        TF32,
+                                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances_shard(
+    device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<
+                                           2,
+                                           NHWGC,
+                                           GKYXC,
+                                           Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                           NHWGK,
+                                           ConvFwdDefault,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<
+                                           2,
+                                           NHWGC,
+                                           GKYXC,
+                                           Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                           NHWGK,
+                                           ConvFwd3x3,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/CMakeLists.txt
index e63ac766b6..41274f8027 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/CMakeLists.txt
@@ -21,10 +21,16 @@ add_instance_library(device_grouped_conv2d_fwd_bias_clamp_instance
    xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp16_comp_part2_instance.cpp
 
    xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_16x16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..61b471cb1c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Tuple<NHWGK>,
+                                                            NHWGK,
+                                                            ConvFwdDefault,
+                                                            Tuple<F32>,
+                                                            AddClamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                                                       NHWGC,
+                                                                                       GKYXC,
+                                                                                       Tuple<NHWGK>,
+                                                                                       NHWGK,
+                                                                                       ConvFwd1x1P0,
+                                                                                       Tuple<F32>,
+                                                                                       AddClamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Tuple<NHWGK>,
+                                                            NHWGK,
+                                                            ConvFwd1x1S1P0,
+                                                            Tuple<F32>,
+                                                            AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..0bf7f8b7b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..b982a92b02
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    Tuple<NHWGK>,
+                                                                    NHWGK,
+                                                                    ConvFwdDefault,
+                                                                    Tuple<F32>,
+                                                                    AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..d9835d7658
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<NHWGK>,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Tuple<NHWGK>,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave,
+                                                                                      Tuple<F32>,
+                                                                                      AddClamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<NHWGK>,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..43c04443c4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<NHWGK>,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Tuple<NHWGK>,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave,
+                                                                                      Tuple<F32>,
+                                                                                      AddClamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<NHWGK>,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..77905b3f67
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_bias_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Tuple<NHWGK>,
+                                                                     NHWGK,
+                                                                     ConvFwdDefault,
+                                                                     Tuple<F32>,
+                                                                     AddClamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Tuple<NHWGK>,
+                                                                     NHWGK,
+                                                                     ConvFwd3x3,
+                                                                     Tuple<F32>,
+                                                                     AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/CMakeLists.txt
index 8faed08c05..f0404cd0f4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/CMakeLists.txt
@@ -21,10 +21,16 @@ add_instance_library(device_grouped_conv2d_fwd_clamp_instance
    xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp16_comp_part2_instance.cpp
 
    xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_16x16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..9977482f8a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/comp/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_comp_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Tuple<>,
+                                                            NHWGK,
+                                                            ConvFwdDefault,
+                                                            Tuple<>,
+                                                            Clamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                                                       NHWGC,
+                                                                                       GKYXC,
+                                                                                       Tuple<>,
+                                                                                       NHWGK,
+                                                                                       ConvFwd1x1P0,
+                                                                                       Tuple<>,
+                                                                                       Clamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<2,
+                                                            NHWGC,
+                                                            GKYXC,
+                                                            Tuple<>,
+                                                            NHWGK,
+                                                            ConvFwd1x1S1P0,
+                                                            Tuple<>,
+                                                            Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..a4b16917bb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<>,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Tuple<>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..f4933e62b8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    Tuple<>,
+                                                                    NHWGK,
+                                                                    ConvFwdDefault,
+                                                                    Tuple<>,
+                                                                    Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..b1e53145e3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<>,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Tuple<>,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave,
+                                                                                      Tuple<>,
+                                                                                      Clamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<>,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..74555cc227
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/mem/device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_fp32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<>,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Tuple<>,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave,
+                                                                                      Tuple<>,
+                                                                                      Clamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Tuple<>,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..b004b4f3cf
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Tuple<>,
+                                                                     NHWGK,
+                                                                     ConvFwdDefault,
+                                                                     Tuple<>,
+                                                                     Clamp>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Tuple<>,
+                                                                     NHWGK,
+                                                                     ConvFwd3x3,
+                                                                     Tuple<>,
+                                                                     Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
index 4bb05e5000..8652d9fd9d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
@@ -9,6 +9,9 @@ set(GROUPED_CONV3D_BWD_DATA
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_optimized_loads_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_optimized_loads_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_optimized_loads_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..63cdfcdad8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_optimized_loads_instances<3,
+                                                                        NDHWGK,
+                                                                        GKZYXC,
+                                                                        Empty_Tuple,
+                                                                        NDHWGC,
+                                                                        ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_bf16_optimized_loads_instances<
+                                       3,
+                                       NDHWGK,
+                                       GKZYXC,
+                                       Empty_Tuple,
+                                       NDHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..7a1ac75a03
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances<3,
+                                                                       NDHWGK,
+                                                                       GKZYXC,
+                                                                       Empty_Tuple,
+                                                                       NDHWGC,
+                                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances<
+                                       3,
+                                       NDHWGK,
+                                       GKZYXC,
+                                       Empty_Tuple,
+                                       NDHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_optimized_loads_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_optimized_loads_instance.cpp
new file mode 100644
index 0000000000..c76f32479e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_optimized_loads_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_optimized_loads_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_optimized_loads_instances<3,
+                                                                       NDHWGK,
+                                                                       GKZYXC,
+                                                                       Empty_Tuple,
+                                                                       NDHWGC,
+                                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_data_xdl_f32_optimized_loads_instances<
+                                       3,
+                                       NDHWGK,
+                                       GKZYXC,
+                                       Empty_Tuple,
+                                       NDHWGC,
+                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 1d9d75a104..5774db21c9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -7,6 +7,7 @@ set(GROUPED_CONV3D_FWD
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp
@@ -19,10 +20,12 @@ set(GROUPED_CONV3D_FWD
    xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
 
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
@@ -30,13 +33,16 @@ set(GROUPED_CONV3D_FWD
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.cpp
 
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.cpp
-   
-      xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
-xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.cpp
+
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..63ff09234c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..c56cadde99
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGK,
+                                                       ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGK,
+                                                       ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..b6c8cd1bdb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    Empty_Tuple,
+                                                                    NDHWGK,
+                                                                    ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..fe6141ac69
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..633123e3c8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..d4a05792d7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Empty_Tuple,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Empty_Tuple,
+                                                                     NDHWGK,
+                                                                     ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt
index bda9149227..b6377ba2b4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt
@@ -2,7 +2,7 @@
 set(GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP)
 include(ShardInstantiation)
 
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
@@ -11,7 +11,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
@@ -20,7 +20,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
@@ -29,7 +29,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl
 )
-   
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances
@@ -38,7 +47,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances
@@ -47,7 +56,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances
@@ -58,7 +67,7 @@ generate_sharded_instantiations(
 )
    # large tensor
    # NDHWGC, GKZYXC, NDHWGK
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances
@@ -67,7 +76,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances
@@ -76,7 +85,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances
@@ -85,9 +94,19 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
+  NUM_SHARDS 2
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+
    # merged groups
    # NDHWGC, GKZYXC, NDHWGK
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances
@@ -96,7 +115,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances
@@ -105,7 +124,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances
@@ -114,9 +133,18 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
    #mem
    # NDHWGC, GKZYXC, NDHWGK
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances
@@ -125,7 +153,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances
@@ -134,7 +162,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances
@@ -144,7 +172,16 @@ generate_sharded_instantiations(
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
    # NDHWGC, GKZYXC, NDHWGK
-   
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances
@@ -153,7 +190,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances
@@ -162,7 +199,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances
@@ -171,9 +208,19 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
 )
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
    #comp
    # NDHWGC, GKZYXC, NDHWGK
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances
@@ -182,7 +229,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances
@@ -191,7 +238,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances
@@ -200,7 +247,16 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances
@@ -209,7 +265,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances
@@ -218,7 +274,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances
@@ -227,7 +283,7 @@ generate_sharded_instantiations(
   SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
   OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
 )
-   
+
 set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
 generate_sharded_instantiations(
   INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.in
new file mode 100644
index 0000000000..352b8207b3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instance.in
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwdDefault,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
new file mode 100644
index 0000000000..d7f3c87b83
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwdDefault,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
new file mode 100644
index 0000000000..74308b1c9d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<
+                                           3,
+                                           NDHWGC,
+                                           GKZYXC,
+                                           Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                           NDHWGK,
+                                           ConvFwdDefault,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.in
new file mode 100644
index 0000000000..b87dce8411
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instance.in
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwdDefault,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1P0,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Interwave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.in
new file mode 100644
index 0000000000..c1df1e262e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instance.in
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwdDefault,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1P0,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<
+                                               3,
+                                               NDHWGC,
+                                               GKZYXC,
+                                               Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                               NDHWGK,
+                                               ConvFwd1x1S1P0,
+                                               Intrawave,
+                                               Tuple<F32, F32, F32, F32, F32>,
+                                               BiasNormalizeInInferClamp>,
+                                           Shards,
+                                           ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
new file mode 100644
index 0000000000..a857b7de4f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.in
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances =
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp,
+                                        TF32,
+                                        TF32>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances_shard(
+    device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<
+                                           3,
+                                           NDHWGC,
+                                           GKZYXC,
+                                           Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                           NDHWGK,
+                                           ConvFwdDefault,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<
+                                           3,
+                                           NDHWGC,
+                                           GKZYXC,
+                                           Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                           NDHWGK,
+                                           ConvFwd3x3,
+                                           Tuple<F32, F32, F32, F32, F32>,
+                                           BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/CMakeLists.txt
index 3bd6916cf0..ef7cc22bc4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/CMakeLists.txt
@@ -19,10 +19,17 @@ set(GROUPED_CONV3D_FWD
    xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
    xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_16x16_instance.cpp
    xdl/large_tensor/device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_comp_instance.cpp
-)
+   xdl/comp/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
+
+   xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
+   )
 
 add_instance_library(device_grouped_conv3d_fwd_bias_clamp_instance ${GROUPED_CONV3D_FWD})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..4b60dd1b3e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Tuple<NDHWGK>,
+                                                            NDHWGK,
+                                                            ConvFwdDefault,
+                                                            Tuple<F32>,
+                                                            AddClamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Tuple<NDHWGK>,
+                                                            NDHWGK,
+                                                            ConvFwd1x1P0,
+                                                            Tuple<F32>,
+                                                            AddClamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Tuple<NDHWGK>,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0,
+                                                            Tuple<F32>,
+                                                            AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..328838bff2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Tuple<F32>,
+                                                                                  AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..04d750d2b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    Tuple<NDHWGK>,
+                                                                    NDHWGK,
+                                                                    ConvFwdDefault,
+                                                                    Tuple<F32>,
+                                                                    AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..765719c7b5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<NDHWGK>,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Tuple<NDHWGK>,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave,
+                                                                                      Tuple<F32>,
+                                                                                      AddClamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<NDHWGK>,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..0daf28adef
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<NDHWGK>,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Tuple<NDHWGK>,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave,
+                                                                                      Tuple<F32>,
+                                                                                      AddClamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<NDHWGK>,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave,
+                                                           Tuple<F32>,
+                                                           AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..2988b715e0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_bias_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                AddClamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Tuple<NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault,
+                                                                     Tuple<F32>,
+                                                                     AddClamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Tuple<NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwd3x3,
+                                                                     Tuple<F32>,
+                                                                     AddClamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/CMakeLists.txt
index 436c37fd58..6a4637d6e1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/CMakeLists.txt
@@ -3,6 +3,7 @@ set(GROUPED_CONV3D_FWD_BILINEAR
    xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp)
 
 add_instance_library(device_grouped_conv3d_fwd_bilinear_instance ${GROUPED_CONV3D_FWD_BILINEAR})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..869c812b50
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bilinear/xdl/device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_bilinear_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Bilinear,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bilinear_f32_tf32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bilinear_f32_tf32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bilinear_f32_tf32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/CMakeLists.txt
index 234533244e..0c126b2084 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/CMakeLists.txt
@@ -19,10 +19,17 @@ set(GROUPED_CONV3D_FWD
    xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
    xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_16x16_instance.cpp
    xdl/large_tensor/device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
+   xdl/large_tensor/device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_comp_instance.cpp
-)
+   xdl/comp/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
+
+   xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
+   )
 
 add_instance_library(device_grouped_conv3d_fwd_clamp_instance ${GROUPED_CONV3D_FWD})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/comp/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/comp/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
new file mode 100644
index 0000000000..3a99d693f9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/comp/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_comp_instance.cpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Tuple<>,
+                                                            NDHWGK,
+                                                            ConvFwdDefault,
+                                                            Tuple<>,
+                                                            Clamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                                                       NDHWGC,
+                                                                                       GKZYXC,
+                                                                                       Tuple<>,
+                                                                                       NDHWGK,
+                                                                                       ConvFwd1x1P0,
+                                                                                       Tuple<>,
+                                                                                       Clamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Tuple<>,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0,
+                                                            Tuple<>,
+                                                            Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..a1bf6562c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Tuple<>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Tuple<>,
+                                                                                  Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..5859576835
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    Tuple<>,
+                                                                    NDHWGK,
+                                                                    ConvFwdDefault,
+                                                                    Tuple<>,
+                                                                    Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..905da7e1d0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_inter_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<>,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Tuple<>,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave,
+                                                                                      Tuple<>,
+                                                                                      Clamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<>,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..008dd28921
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/mem/device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_mem_intra_instance.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_tf32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<>,
+                                                           NDHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Tuple<>,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave,
+                                                                                      Tuple<>,
+                                                                                      Clamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_tf32_mem_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Tuple<>,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave,
+                                                           Tuple<>,
+                                                           Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
new file mode 100644
index 0000000000..66874c5696
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_fp32_tf32_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Clamp,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Tuple<>,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault,
+                                                                     Tuple<>,
+                                                                     Clamp>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_tf32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     Tuple<>,
+                                                                     NDHWGK,
+                                                                     ConvFwd3x3,
+                                                                     Tuple<>,
+                                                                     Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/CMakeLists.txt
index f36d55d367..47fc2655bb 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/CMakeLists.txt
@@ -3,6 +3,7 @@ set(GROUPED_CONV3D_FWD_BILINEAR
    xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp)
 
 add_instance_library(device_grouped_conv3d_fwd_scale_instance ${GROUPED_CONV3D_FWD_BILINEAR})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
new file mode 100644
index 0000000000..5377cc56bd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scale/xdl/device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instance.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scale_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Scale,
+                                                                TF32,
+                                                                TF32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scale_f32_tf32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scale_f32_tf32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scale_f32_tf32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
index 5d50902be8..a5b4fb5df4 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
@@ -20,6 +20,12 @@ list(APPEND GEMM_QUANT_SRC
     gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
     gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp)
 
+list(APPEND GEMM_QUANT_SRC
+    gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+    gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+    gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp)
+
 add_instance_library(device_quantization_instance
     ${CONV2D_PERLAYER_QUANT_SRC}
     ${CONV2D_PERCHANNEL_QUANT_SRC}
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp
new file mode 100644
index 0000000000..3737f0a958
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_quantization_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <typename OutElementOp,
+          BlockGemmPipelineScheduler GemmPipelineScheduler,
+          BlockGemmPipelineVersion GemmPipeline>
+using device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|   DsLayout| ELayout|  AData|  BData|      DsData|    EData| AccData| CShuffle|           A|           B|           CDE|   GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|               BlkGemm|     BlkGemm| ComputeTypeA| ComputeTypeB|
+        //################################|        |        |           |        |   Type|   Type|        Type|     Type|    Type| DataType| Elementwise| Elementwise|   Elementwise|           |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|             PipeSched| PipelineVer|             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |   Operation|   Operation|     Operation|           |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |                      |            |             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |            |            |              |           |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |                      |            |             |             |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,          1,          1,       S<1, 32, 1, 4>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,     false,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,     false,          1,          1,       S<1, 16, 1, 4>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>
+    // clang-format on
+    >;
+
+template <typename OutElementOp,
+          BlockGemmPipelineScheduler GemmPipelineScheduler,
+          BlockGemmPipelineVersion GemmPipeline>
+using device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|   DsLayout| ELayout| AData|  BData|      DsData|    EData| AccData| CShuffle|           A|           B|           CDE|   GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|               BlkGemm|     BlkGemm| ComputeTypeA| ComputeTypeB|
+        //################################|        |        |           |        |  Type|   Type|        Type|     Type|    Type| DataType| Elementwise| Elementwise|   Elementwise|           |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|             PipeSched| PipelineVer|             |             |
+        //################################|        |        |           |        |      |       |            |         |        |         |   Operation|   Operation|     Operation|           |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |                      |            |             |             |
+        //################################|        |        |           |        |      |       |            |         |        |         |            |            |              |           |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |                      |            |             |             |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,          1,          1,       S<1, 64, 1, 4>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,     false,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,          1,          1,       S<1, 64, 1, 2>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>
+    // clang-format on
+    >;
+
+template <typename OutElementOp,
+          BlockGemmPipelineScheduler GemmPipelineScheduler,
+          BlockGemmPipelineVersion GemmPipeline>
+using device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|   DsLayout| ELayout|  AData|  BData|      DsData|    EData| AccData| CShuffle|           A|           B|           CDE|   GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|               BlkGemm|     BlkGemm| ComputeTypeA| ComputeTypeB|
+        //################################|        |        |           |        |   Type|   Type|        Type|     Type|    Type| DataType| Elementwise| Elementwise|   Elementwise|           |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|             PipeSched| PipelineVer|             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |   Operation|   Operation|     Operation|           |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |                      |            |             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |            |            |              |           |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |                      |            |             |             |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,     false,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,     false,          1,          1,       S<1, 32, 1, 4>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Row, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,          1,          1,       S<1, 32, 1, 8>,                     S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>
+    // clang-format on
+    >;
+
+template <typename OutElementOp,
+          BlockGemmPipelineScheduler GemmPipelineScheduler,
+          BlockGemmPipelineVersion GemmPipeline>
+using device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|   DsLayout| ELayout|  AData|  BData|      DsData|    EData| AccData| CShuffle|           A|           B|           CDE|   GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|               BlkGemm|     BlkGemm| ComputeTypeA| ComputeTypeB|
+        //################################|        |        |           |        |   Type|   Type|        Type|     Type|    Type| DataType| Elementwise| Elementwise|   Elementwise|           |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors|             PipeSched| PipelineVer|             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |   Operation|   Operation|     Operation|           |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |                      |            |             |             |
+        //################################|        |        |           |        |       |       |            |         |        |         |            |            |              |           |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |                      |            |             |             |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Col, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,          1,          1,        S<1, 32, 1, 8>,                    S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Col, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,          1,          1,        S<1, 32, 1, 4>,                    S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Col, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,          1,          1,        S<1, 32, 1, 8>,                    S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<    Row,    Col, Empty_Tuple,     Row, int8_t, int8_t, Empty_Tuple,   int8_t, int32_t,  int32_t, PassThrough, PassThrough,  OutElementOp, MNKPadding,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,          1,          1,        S<1, 16, 1, 4>,                    S<1>, GemmPipelineScheduler, GemmPipeline,      int8_t,      int8_t>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
new file mode 100644
index 0000000000..a3838bb398
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v3>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_kn_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
new file mode 100644
index 0000000000..31ff723166
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v3>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_km_nk_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
new file mode 100644
index 0000000000..07a632a77c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v3>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_kn_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
new file mode 100644
index 0000000000..ed9cc908ef
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          int8_t,
+                                                          int8_t,
+                                                          Empty_Tuple,
+                                                          int8_t,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v3>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_wmma_c_shuffle_i8_i8_i8_mk_nk_mn_instances<
+            Mul_Clamp,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
index e7c2500fef..a4eb29c7a1 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -33,7 +33,8 @@ using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<
 using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
 using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
 
-static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto MNKPadding  = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 } // namespace instance
 } // namespace device
diff --git a/library/src/utility/host_tensor.cpp b/library/src/utility/host_tensor.cpp
index 7211552641..cc394f2535 100644
--- a/library/src/utility/host_tensor.cpp
+++ b/library/src/utility/host_tensor.cpp
@@ -5,18 +5,6 @@
 
 #include "ck/library/utility/host_tensor.hpp"
 
-void HostTensorDescriptor::CalculateStrides()
-{
-    mStrides.clear();
-    mStrides.resize(mLens.size(), 0);
-    if(mStrides.empty())
-        return;
-
-    mStrides.back() = 1;
-    std::partial_sum(
-        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
-}
-
 std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); }
 
 std::size_t HostTensorDescriptor::GetElementSize() const
@@ -53,7 +41,18 @@ std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
 
     os << "strides {";
     LogRange(os, desc.GetStrides(), ", ");
-    os << "}";
+    os << "} ";
 
     return os;
 }
+
+std::ostream& operator<<(std::ostream& os, HostTensorDescriptor::ChosenLayout tag)
+{
+    switch(tag)
+    {
+    case HostTensorDescriptor::ChosenLayout::Original: os << "Original"; break;
+    case HostTensorDescriptor::ChosenLayout::RowMajor: os << "RowMajor"; break;
+    case HostTensorDescriptor::ChosenLayout::ColumnMajor: os << "ColumnMajor"; break;
+    }
+    return os;
+}
diff --git a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
index caf24f016a..537a4703d3 100644
--- a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
@@ -39,7 +39,8 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -82,7 +83,9 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
             using namespace ck::literals;
 
-            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+            return HostTensorDescriptor({N_, C_, H, W},
+                                        {C_ * H * W, 1_uz, W * C_, C_},
+                                        ck::tensor_layout::convolution::NCHW{});
         };
 
     Tensor<DOutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
@@ -164,6 +167,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -247,7 +255,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
 
diff --git a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
index e7e8f7213f..c97e42228d 100644
--- a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
@@ -48,7 +48,8 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -93,7 +94,8 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
             using namespace ck::literals;
 
             return HostTensorDescriptor({N_, C_, D, H, W},
-                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_},
+                                        ck::tensor_layout::convolution::NDHWC{});
         };
 
     Tensor<DOutDataType> dout_n_c_do_ho_wo(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
@@ -165,6 +167,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -245,7 +252,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
index 22dab31100..4b0b8e5bcb 100644
--- a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
@@ -116,11 +116,13 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
 
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp
index a91191b33d..060fbd70e5 100644
--- a/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp
@@ -66,11 +66,13 @@ bool profile_batched_gemm_b_scale_impl(int do_verification,
 
         if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index be69b67b5c..ca0d031dba 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -20,6 +20,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 namespace ck {
 namespace profiler {
 
@@ -45,10 +49,10 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
                                                          int O,
                                                          int G0,
                                                          int G1,
-                                                         float alpha = -1.f)
+                                                         float alpha        = -1.f,
+                                                         int instance_index = -1)
 
 {
-
     using PassThrough   = tensor_operation::element_wise::PassThrough;
     using ScaleAdd      = tensor_operation::element_wise::ScaleAdd;
     using AElementOp    = PassThrough;
@@ -107,12 +111,12 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
 
     const int BatchCount = G0 * G1;
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Row{});
+    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides, Row{});
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Col{});
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -273,7 +277,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -310,6 +314,13 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -388,6 +399,11 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_bias_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
index b585b7d56a..a8571d0779 100644
--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -110,11 +110,13 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
 
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
@@ -220,9 +222,10 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
     }
 
     std::string best_op_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
+    float best_ave_time         = 0;
+    float best_tflops           = 0;
+    float best_gb_per_sec       = 0;
+    int num_supported_instances = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -255,6 +258,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_supported_instances++;
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -309,6 +313,8 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
         }
     }
 
+    printf("\033[36mFound %d supported instances\033[0m\n", num_supported_instances);
+
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index 92e06e4a70..0fdda68c4d 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -47,7 +47,8 @@ bool profile_batched_gemm_impl(int do_verification,
                                int BatchStrideA,
                                int BatchStrideB,
                                int BatchStrideC,
-                               int BatchCount)
+                               int BatchCount,
+                               int instance_index = -1)
 {
     bool pass = true;
 
@@ -61,11 +62,13 @@ bool profile_batched_gemm_impl(int do_verification,
 
         if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
@@ -136,6 +139,7 @@ bool profile_batched_gemm_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -201,6 +205,12 @@ bool profile_batched_gemm_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             c_device_buf.SetZero();
 
@@ -257,6 +267,11 @@ bool profile_batched_gemm_impl(int do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
index 901fa338d4..cb91d8090d 100644
--- a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
@@ -83,11 +83,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
 
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index 700ada73a1..183b0e183a 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -40,19 +40,19 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int N,
                                             int K,
                                             int O,
-                                            int BatchCount    = 1,
-                                            int StrideA       = -1,
-                                            int StrideB0      = -1,
-                                            int StrideB1      = -1,
-                                            int StrideC       = -1,
-                                            int BatchStrideA  = -1,
-                                            int BatchStrideB0 = -1,
-                                            int BatchStrideB1 = -1,
-                                            int BatchStrideC  = -1,
-                                            float alpha       = -1.f)
+                                            int BatchCount     = 1,
+                                            int StrideA        = -1,
+                                            int StrideB0       = -1,
+                                            int StrideB1       = -1,
+                                            int StrideC        = -1,
+                                            int BatchStrideA   = -1,
+                                            int BatchStrideB0  = -1,
+                                            int BatchStrideB1  = -1,
+                                            int BatchStrideC   = -1,
+                                            float alpha        = -1.f,
+                                            int instance_index = -1)
 
 {
-
     using Row           = tensor_layout::gemm::RowMajor;
     using Col           = tensor_layout::gemm::ColumnMajor;
     using PassThrough   = tensor_operation::element_wise::PassThrough;
@@ -118,11 +118,13 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
 
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
@@ -251,7 +253,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -283,6 +285,13 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -339,7 +348,11 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index e3c462e21c..e953cc4b66 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -20,6 +20,9 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 namespace ck {
 namespace profiler {
 
@@ -45,10 +48,10 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
                                                     int O,
                                                     int G0,
                                                     int G1,
-                                                    float alpha = -1.f)
+                                                    float alpha        = -1.f,
+                                                    int instance_index = -1)
 
 {
-
     using PassThrough   = tensor_operation::element_wise::PassThrough;
     using Scale         = tensor_operation::element_wise::Scale;
     using AElementOp    = PassThrough;
@@ -101,11 +104,11 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
 
     const int BatchCount = G0 * G1;
 
-    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
-    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
-    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
-    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Row{});
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Bypass{});
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
 
     std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
     std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -251,6 +254,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -284,6 +288,13 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -359,7 +370,11 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
index 3343b5e66e..bf5a661407 100644
--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -34,7 +34,8 @@ bool profile_batchnorm_backward_impl(bool do_verification,
                                      const std::vector<size_t> inOutLengths,
                                      const std::vector<int> reduceDims,
                                      bool haveSavedMeanInvVar,
-                                     double epsilon)
+                                     double epsilon,
+                                     index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -293,6 +294,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -382,7 +388,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if (instance_index != -1)
+    {
+        std::cout << "batchnorm_backward_instance (" << instance_index << "/" << num_kernel
+            << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
index 2f9538b16c..078f6bff87 100644
--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -35,7 +35,8 @@ bool profile_batchnorm_forward_impl(int do_verification,
                                     bool updateMovingAverage,
                                     bool saveMeanAndInvVariance,
                                     double averageFactor,
-                                    double epsilon)
+                                    double epsilon,
+                                    index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -287,6 +288,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -404,7 +410,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_forward_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
index 1b31a2aabf..c866b88e8a 100644
--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -32,7 +32,8 @@ bool profile_batchnorm_infer_impl(int do_verification,
                                   bool time_kernel,
                                   const std::vector<size_t> inOutLengths,
                                   const std::vector<int> reduceDims,
-                                  double epsilon)
+                                  double epsilon,
+                                  index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -253,6 +254,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -327,7 +333,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if (instance_index != -1)
+    {
+        std::cout << "batchnorm_infer_instance (" << instance_index << "/" << num_kernel
+            << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_contraction_impl.hpp b/profiler/include/profiler/profile_contraction_impl.hpp
index 604032a01d..361861a6d1 100644
--- a/profiler/include/profiler/profile_contraction_impl.hpp
+++ b/profiler/include/profiler/profile_contraction_impl.hpp
@@ -54,25 +54,36 @@ int profile_contraction_impl(ck::index_t do_verification,
                              const std::vector<ck::index_t>& StridesA, // [M0, M1, K0, K1]
                              const std::vector<ck::index_t>& StridesB, // [N0, N1, K0, K1]
                              const std::vector<ck::index_t>& StridesE, // [M0, M1, N0, N1]
-                             const std::vector<ck::index_t>& StridesD) // [M0, M1, N0, N1]
+                             const std::vector<ck::index_t>& StridesD, // [M0, M1, N0, N1]
+                             int instance_index = -1)
 {
     bool pass = true;
 
     auto f_host_tensor_descriptor = [](const std::vector<ck::index_t>& dims01,
                                        const std::vector<ck::index_t>& dims23,
-                                       const std::vector<ck::index_t>& strides) {
+                                       const std::vector<ck::index_t>& strides,
+                                       auto layout) {
         std::vector<std::size_t> dims_szt(dims01.begin(), dims01.end());
         dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
-        std::vector<std::size_t> strides_szt(strides.begin(), strides.end());
 
-        return HostTensorDescriptor(dims_szt, strides);
+        // For ColumnMajor with more than 2 dimensions, the strides are custom-defined, so skip
+        // verification.
+        if constexpr(ck::is_same_v<decltype(layout), ck::tensor_layout::gemm::ColumnMajor>)
+        {
+            if(strides.size() > 2)
+            {
+                return HostTensorDescriptor(
+                    dims_szt, strides, ck::tensor_layout::BypassLayoutVerification{});
+            }
+        }
+        return HostTensorDescriptor(dims_szt, strides, layout);
     };
 
-    Tensor<DataType> a_m_k(f_host_tensor_descriptor(M, K, StridesA));
-    Tensor<DataType> b_n_k(f_host_tensor_descriptor(N, K, StridesB));
-    Tensor<DataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
-    Tensor<DataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StridesE));
-    Tensor<DataType> d_m_n(f_host_tensor_descriptor(M, N, StridesD));
+    Tensor<DataType> a_m_k(f_host_tensor_descriptor(M, K, StridesA, ALayout{}));
+    Tensor<DataType> b_n_k(f_host_tensor_descriptor(N, K, StridesB, BLayout{}));
+    Tensor<DataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE, CDELayout{}));
+    Tensor<DataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StridesE, CDELayout{}));
+    Tensor<DataType> d_m_n(f_host_tensor_descriptor(M, N, StridesD, CDELayout{}));
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_n_k: " << b_n_k.mDesc << std::endl;
@@ -160,7 +171,7 @@ int profile_contraction_impl(ck::index_t do_verification,
         auto ref_op      = ReferenceGemmInstance{};
         auto ref_invoker = ref_op.MakeInvoker();
 
-        Tensor<DataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
+        Tensor<DataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE, CDELayout{}));
 
         auto ref_argument =
             ref_op.MakeArgument(a_m_k, b_n_k, c_m_n_host_result, a_element_op, b_element_op);
@@ -187,7 +198,7 @@ int profile_contraction_impl(ck::index_t do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -246,6 +257,12 @@ int profile_contraction_impl(ck::index_t do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             e_device_buf.SetZero();
 
@@ -366,6 +383,11 @@ int profile_contraction_impl(ck::index_t do_verification,
               << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
               << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "contraction_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
index 5ea1a78094..8f7adebdd4 100644
--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -58,7 +58,8 @@ bool profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
-                                const ck::utils::conv::ConvParam& conv_param)
+                                const ck::utils::conv::ConvParam& conv_param,
+                                int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -174,7 +175,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device Conv instances
     bool pass = true;
 
@@ -200,6 +201,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // for conv bwd data, some input tensor element are zero, but not written by kernel,
             // need to set zero
             in_device_buf.SetZero();
@@ -263,7 +270,11 @@ bool profile_conv_bwd_data_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
index 37366821c4..200409fe61 100644
--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -36,7 +36,8 @@ bool profile_conv_fwd_impl(int do_verification,
                            int init_method,
                            bool do_log,
                            bool time_kernel,
-                           const ck::utils::conv::ConvParam& conv_param)
+                           const ck::utils::conv::ConvParam& conv_param,
+                           int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -156,7 +157,7 @@ bool profile_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -182,6 +183,12 @@ bool profile_conv_fwd_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
 
@@ -236,7 +243,11 @@ bool profile_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_fwd_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
index 14182bb7b0..171ae1662b 100644
--- a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
+++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
@@ -100,12 +100,12 @@ static auto create_gemm_desc(const ck::index_t G, const ck::index_t NDoHoWo, con
     if constexpr(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
                  std::is_same_v<InputLayout, GNDHWC>)
     {
-        return HostTensorDescriptor({G, NDoHoWo, CZYX});
+        return HostTensorDescriptor({G, NDoHoWo, CZYX}, InputLayout{});
     }
     else if constexpr(std::is_same_v<InputLayout, NWGC> || std::is_same_v<InputLayout, NHWGC> ||
                       std::is_same_v<InputLayout, NDHWGC>)
     {
-        return HostTensorDescriptor({G, NDoHoWo, CZYX}, {CZYX, CZYX * G, 1});
+        return HostTensorDescriptor({G, NDoHoWo, CZYX}, {CZYX, CZYX * G, 1}, InputLayout{});
     }
     else
     {
@@ -122,7 +122,8 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        const ck::utils::conv::ConvParam& conv_param)
+                                        const ck::utils::conv::ConvParam& conv_param,
+                                        index_t instance_index = -1)
 {
     const ck::index_t NDoHoWo =
         conv_param.N_ *
@@ -226,7 +227,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     // profile device op instances
     bool pass                   = true;
     bool is_supporting_instance = false;
-
+    index_t num_kernel          = 0;
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
@@ -247,6 +248,12 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             is_supporting_instance = true;
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
@@ -291,6 +298,11 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\nGB/s: " << best_gb_per_sec << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "conv_tensor_rearrange_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return is_supporting_instance && pass;
 }
 
diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
index 220076465d..ca08f48bcf 100644
--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -49,7 +49,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        std::vector<index_t> length)
+                                        std::vector<index_t> length,
+                                        index_t instance_index = -1)
 {
     using Add         = ck::tensor_operation::element_wise::Add;
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -199,6 +200,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -270,6 +276,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "elementwise_layernorm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
index d68a1065ab..f17516a47d 100644
--- a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
@@ -19,7 +19,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -75,10 +74,6 @@ bool profile_gemm_ab_scale_impl(int do_verification,
                                       ? ((K + ScaleBlockK - 1) / ScaleBlockK)
                                       : ((N + ScaleBlockN - 1) / ScaleBlockN);
 
-    ck::utils::validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
-    ck::utils::validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
-    ck::utils::validate_gemm_stride<BLayout>(M, N, StrideE, "StrideE");
-
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + ScaleBlockM - 1) / ScaleBlockM,
                                                        (K + ScaleBlockK - 1) / ScaleBlockK,
diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
index 6f6d881c1e..9e4d30142b 100644
--- a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
@@ -33,7 +33,7 @@ template <typename ADataType,
           typename ELayout>
 bool profile_gemm_add_fastgelu_impl(int do_verification,
                                     int init_method,
-                                    bool /*do_log*/,
+                                    bool do_log,
                                     bool time_kernel,
                                     int M,
                                     int N,
@@ -213,6 +213,17 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
             {
                 e_device_buf.FromDevice(e_m_n_device_result.mData.data());
 
+                if(do_log)
+                {
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_device_result: ", e_m_n_device_result.mData, ",")
+                        << std::endl;
+
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_host_result: ", e_m_n_host_result.mData, ",")
+                        << std::endl;
+                }
+
                 pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
             }
         }
diff --git a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
index 25871dfb2e..fcb546fe96 100644
--- a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
@@ -35,7 +35,7 @@ template <typename ADataType,
           typename ELayout>
 bool profile_gemm_add_multiply_impl(int do_verification,
                                     int init_method,
-                                    bool /*do_log*/,
+                                    bool do_log,
                                     bool time_kernel,
                                     int M,
                                     int N,
@@ -223,6 +223,17 @@ bool profile_gemm_add_multiply_impl(int do_verification,
             {
                 e_device_buf.FromDevice(e_m_n_device_result.mData.data());
 
+                if(do_log)
+                {
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_device_result: ", e_m_n_device_result.mData, ",")
+                        << std::endl;
+
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_host_result: ", e_m_n_host_result.mData, ",")
+                        << std::endl;
+                }
+
                 pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
             }
         }
diff --git a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
index 46591a3525..a8daf4e787 100644
--- a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
@@ -136,19 +136,27 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
         return HostTensorDescriptor({len}, {stride});
     };
 
-    auto f_host_tensor_descriptor2d =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor2d = [](std::size_t row,
+                                         std::size_t col,
+                                         int& stride,
+                                         auto layout) {
+        using namespace ck::literals;
 
-            if constexpr(std::is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_add_relu_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_impl.hpp
index 5d79a98c11..e7f4338ef0 100644
--- a/profiler/include/profiler/profile_gemm_add_relu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_relu_impl.hpp
@@ -43,19 +43,24 @@ bool profile_gemm_add_relu_impl(int do_verification,
                                 int StrideD0,
                                 int StrideE)
 {
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
+        using namespace ck::literals;
 
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
index 405a2359c2..b265101f3f 100644
--- a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
@@ -15,7 +15,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -86,17 +85,14 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
 
diff --git a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
index 32bdf05771..0921b48842 100644
--- a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
@@ -20,7 +20,6 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/utility/validation_common.hpp"
 
 namespace ck {
 namespace profiler {
@@ -86,29 +85,30 @@ bool profile_gemm_blockscale_weighpreshuffle_impl(int do_verification,
 {
     bool pass = true;
 
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
+        using namespace ck::literals;
 
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     ck::index_t Scale_Stride_AM = ((M + ScaleBlockM - 1) / ScaleBlockM);
     ck::index_t Scale_Stride_BN = ck::is_same_v<BLayout, ck::tensor_layout::gemm::ColumnMajor>
                                       ? ((K + ScaleBlockK - 1) / ScaleBlockK)
                                       : ((N + ScaleBlockN - 1) / ScaleBlockN);
 
-    ck::utils::validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
-    ck::utils::validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
-    ck::utils::validate_gemm_stride<BLayout>(M, N, StrideE, "StrideE");
-
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + ScaleBlockM - 1) / ScaleBlockM,
                                                        (K + ScaleBlockK - 1) / ScaleBlockK,
diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
index 3893f8cdc7..0fe8abe242 100644
--- a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
@@ -40,19 +40,24 @@ bool profile_gemm_fastgelu_impl(int do_verification,
                                 int StrideB,
                                 int StrideE)
 {
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
+    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
+        using namespace ck::literals;
 
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index fdcb3ad128..93eac048cd 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -24,7 +24,6 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/fill.hpp"
-#include "ck/library/utility/validation_common.hpp"
 
 namespace ck {
 namespace profiler {
@@ -57,17 +56,14 @@ int profile_gemm_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp b/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp
new file mode 100644
index 0000000000..46745fd02b
--- /dev/null
+++ b/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+// this function is also defined in CK but because of the way we use it in
+// profile_gemm_multi_impl, it requires the arguments to not be const
+template <typename... X, typename... Y>
+auto concat_tuple_of_refs(ck::Tuple<X&...>& tx, ck::Tuple<Y&...>& ty)
+{
+    return ck::unpack2(
+        [&](auto&&... zs) { return ck::Tuple<decltype(zs)...>{ck::forward<decltype(zs)>(zs)...}; },
+        tx,
+        ty);
+}
+
+template <typename AsDataType,
+          typename BsDataType,
+          typename AccDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CDEElementOp>
+bool profile_gemm_multi_abd_impl(int do_verification,
+                                 int init_method,
+                                 bool /*do_log*/,
+                                 bool time_kernel,
+                                 int M,
+                                 int N,
+                                 int K,
+                                 int StrideA,
+                                 int StrideB,
+                                 int StrideD,
+                                 int StrideE)
+{
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    static constexpr index_t NumATensor = AsDataType::Size();
+    auto as_m_k                         = generate_tuple(
+        [&](auto i) {
+            using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+            using ALayout   = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+
+            return Tensor<ADataType>(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+        },
+        Number<NumATensor>{});
+
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    auto bs_k_n                         = generate_tuple(
+        [&](auto i) {
+            using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+            using BLayout   = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+
+            return Tensor<BDataType>(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+        },
+        Number<NumBTensor>{});
+
+    static constexpr index_t NumDTensor = DsDataType::Size();
+    auto ds_m_n                         = generate_tuple(
+        [&](auto i) {
+            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+            using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+            return Tensor<DDataType>(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+        },
+        Number<NumDTensor>{});
+
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    static_for<0, NumATensor, 1>{}(
+        [&](auto i) { std::cout << "a" << i.value << "_m_k: " << as_m_k(i).mDesc << std::endl; });
+    static_for<0, NumBTensor, 1>{}(
+        [&](auto i) { std::cout << "b" << i.value << "_k_n: " << bs_k_n(i).mDesc << std::endl; });
+    static_for<0, NumDTensor, 1>{}(
+        [&](auto i) { std::cout << "d" << i.value << "_m_n: " << ds_m_n(i).mDesc << std::endl; });
+    std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+            as_m_k(i).GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        });
+
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+            bs_k_n(i).GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        });
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+            ds_m_n(i).GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        });
+
+        break;
+    default:
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+            as_m_k(i).GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        });
+
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+            bs_k_n(i).GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        });
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+            ds_m_n(i).GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
+        });
+    }
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
+                                                                         BsLayout,
+                                                                         DsLayout,
+                                                                         ELayout,
+                                                                         AsDataType,
+                                                                         BsDataType,
+                                                                         DsDataType,
+                                                                         EDataType,
+                                                                         AElementOp,
+                                                                         BElementOp,
+                                                                         CDEElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // run reference
+    if(do_verification)
+    {
+        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+        Tensor<AccDataType> c_m_n({M, N});
+
+        using AComputeType =
+            typename std::conditional<(NumATensor > 1),
+                                      EDataType,
+                                      remove_cvref_t<tuple_element_t<0, AsDataType>>>::type;
+
+        auto get_a_matrix = [&]() -> auto {
+            // in case of pass through we avoid allocating a new
+            // tensor and copying values
+            if constexpr(is_same_v<AElementOp, PassThrough>)
+            {
+                return as_m_k(Number<0>{});
+            }
+            else
+            {
+                Tensor<AComputeType> a_m_k({M, K});
+                for(int m = 0; m < M; ++m)
+                {
+                    for(int k = 0; k < K; ++k)
+                    {
+                        // result
+                        auto data_refs1 = ck::tie(a_m_k(m, k));
+                        // inputs
+                        auto data_refs2 =
+                            generate_tie([&](auto i) -> auto& { return as_m_k(Number<i>{})(m, k); },
+                                         Number<NumATensor>{});
+                        auto data_refs = concat_tuple_of_refs(data_refs1, data_refs2);
+                        unpack(a_element_op, data_refs);
+                    }
+                }
+                return a_m_k;
+            }
+        };
+
+        using BComputeType =
+            typename std::conditional<(NumBTensor > 1),
+                                      EDataType,
+                                      remove_cvref_t<tuple_element_t<0, BsDataType>>>::type;
+
+        auto get_b_matrix = [&]() -> auto {
+            // in case of pass through we avoid allocating a new
+            // tensor and copying values
+            if constexpr(is_same_v<BElementOp, PassThrough>)
+            {
+                return bs_k_n(Number<0>{});
+            }
+            else
+            {
+                Tensor<BComputeType> b_k_n({K, N});
+                for(int k = 0; k < K; ++k)
+                {
+                    for(int n = 0; n < N; ++n)
+                    {
+                        // result
+                        auto data_refs1 = ck::tie(b_k_n(k, n));
+                        // inputs
+                        auto data_refs2 =
+                            generate_tie([&](auto i) -> auto& { return bs_k_n(Number<i>{})(k, n); },
+                                         Number<NumBTensor>{});
+                        auto data_refs = concat_tuple_of_refs(data_refs1, data_refs2);
+                        unpack(b_element_op, data_refs);
+                    }
+                }
+                return b_k_n;
+            }
+        };
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<AComputeType,
+                                                                                BComputeType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            get_a_matrix(), get_b_matrix(), c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                // compulsory
+                auto data_refs1 = ck::tie(e_m_n_host_result(m, n), c_m_n(m, n));
+                // optional (if multiple Ds)
+                auto data_refs2 =
+                    generate_tie([&](auto i) -> auto& { return ds_m_n(Number<i>{})(m, n); },
+                                 Number<NumDTensor>{});
+                auto data_refs = concat_tuple_of_refs(data_refs1, data_refs2);
+                unpack(cde_element_op, data_refs);
+            }
+        }
+    }
+
+    std::array<DeviceMem*, NumATensor> as_device_buf;
+    static_for<0, NumATensor, 1>{}([&](auto i) {
+        using ADataType  = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+        as_device_buf[i] = new DeviceMem(sizeof(ADataType) * as_m_k(i).mDesc.GetElementSpaceSize());
+    });
+
+    std::array<DeviceMem*, NumBTensor> bs_device_buf;
+    static_for<0, NumBTensor, 1>{}([&](auto i) {
+        using BDataType  = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+        bs_device_buf[i] = new DeviceMem(sizeof(BDataType) * bs_k_n(i).mDesc.GetElementSpaceSize());
+    });
+
+    std::array<DeviceMem*, NumDTensor> ds_device_buf;
+    static_for<0, NumDTensor, 1>{}([&](auto i) {
+        using DDataType  = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+        ds_device_buf[i] = new DeviceMem(sizeof(DDataType) * ds_m_n(i).mDesc.GetElementSpaceSize());
+    });
+
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    static_for<0, NumATensor, 1>{}(
+        [&](auto i) { as_device_buf[i]->ToDevice(as_m_k(i).mData.data()); });
+
+    static_for<0, NumBTensor, 1>{}(
+        [&](auto i) { bs_device_buf[i]->ToDevice(bs_k_n(i).mData.data()); });
+
+    static_for<0, NumDTensor, 1>{}(
+        [&](auto i) { ds_device_buf[i]->ToDevice(ds_m_n(i).mData.data()); });
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    bool pass = true;
+
+    // profile device operation instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        std::array<const void*, NumATensor> as_pointer;
+        std::array<ck::index_t, NumATensor> as_stride;
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            as_pointer[i] = as_device_buf[i]->GetDeviceBuffer();
+            as_stride[i]  = StrideA;
+        });
+
+        std::array<const void*, NumBTensor> bs_pointer;
+        std::array<ck::index_t, NumBTensor> bs_stride;
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            bs_pointer[i] = bs_device_buf[i]->GetDeviceBuffer();
+            bs_stride[i]  = StrideB;
+        });
+        std::array<const void*, NumDTensor> ds_pointer;
+        std::array<ck::index_t, NumDTensor> ds_stride;
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            ds_pointer[i] = ds_device_buf[i]->GetDeviceBuffer();
+            ds_stride[i]  = StrideD;
+        });
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(as_pointer,
+                                                        bs_pointer,
+                                                        ds_pointer,
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        as_stride,
+                                                        bs_stride,
+                                                        ds_stride,
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init E to zero before profiling a kernel
+            e_device_buf.SetZero();
+
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t sizeADataType = 0;
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+                sizeADataType   = std::max(sizeADataType, sizeof(ADataType));
+            });
+            std::size_t sizeBDataType = 0;
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+                sizeBDataType   = std::max(sizeBDataType, sizeof(BDataType));
+            });
+
+            std::size_t num_btype =
+                sizeADataType * M * K + sizeBDataType * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    static_for<0, NumATensor, 1>{}([&](auto i) { delete as_device_buf[i]; });
+
+    static_for<0, NumBTensor, 1>{}([&](auto i) { delete bs_device_buf[i]; });
+
+    static_for<0, NumDTensor, 1>{}([&](auto i) { delete ds_device_buf[i]; });
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
index f9a5a995fe..2711d595d6 100644
--- a/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp
@@ -46,20 +46,25 @@ bool profile_gemm_multiply_add_impl(int do_verification,
                                     int StrideD1,
                                     int StrideE)
 {
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
 
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
+    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
+        using namespace ck::literals;
 
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
+            if(stride <= 0)
+                stride = desc.GetStrides()[0];
+            return desc;
+        }
+        else
+        {
+            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
+            if(stride <= 0)
+                stride = desc.GetStrides()[1];
+            return desc;
+        }
+    };
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
@@ -117,6 +122,11 @@ bool profile_gemm_multiply_add_impl(int do_verification,
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
 
+    if(op_ptrs.size() == 0)
+    {
+        std::cout << "No device operation instances found." << std::endl;
+        return false;
+    }
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
 
     // run reference
diff --git a/profiler/include/profiler/profile_gemm_quantization_impl.hpp b/profiler/include/profiler/profile_gemm_quantization_impl.hpp
new file mode 100644
index 0000000000..02f374164e
--- /dev/null
+++ b/profiler/include/profiler/profile_gemm_quantization_impl.hpp
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout>
+bool profile_gemm_quantization_impl(int do_verification,
+                                    int init_method,
+                                    bool do_log,
+                                    bool time_kernel,
+                                    int M,
+                                    int N,
+                                    int K,
+                                    int StrideA,
+                                    int StrideB,
+                                    int StrideE,
+                                    float requant_scale = 0.03f)
+{
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+    }
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using MulClamp    = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
+
+    using AElementOp   = PassThrough;
+    using BElementOp   = PassThrough;
+    using ActivationOp = PassThrough;
+    using CDEElementOp = MulClamp;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // run reference
+    if(do_verification)
+    {
+        Tensor<AccDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n));
+            }
+        }
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    bool pass = true;
+
+    // profile device operation instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        std::array<const void*, 0>{},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        std::array<ck::index_t, 0>{},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init E to zero before profiling a kernel
+            e_device_buf.SetZero();
+
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_device_result: ", e_m_n_device_result.mData, ",")
+                        << std::endl;
+
+                    LogRangeAsType<float>(
+                        std::cout << "e_m_n_host_result: ", e_m_n_host_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index a74d2a01d9..74a1b60fe3 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -15,7 +15,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -71,7 +70,8 @@ bool profile_gemm_reduce_impl(int do_verification,
                               int K,
                               int StrideA,
                               int StrideB,
-                              int StrideC)
+                              int StrideC,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -81,17 +81,14 @@ bool profile_gemm_reduce_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
 
@@ -253,7 +250,7 @@ bool profile_gemm_reduce_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device GEMM instances
     for(auto& gemm_ptr : gemm_ptrs)
     {
@@ -279,6 +276,12 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // init DO, D1 to 0
             reduce0_device_buf.SetZero();
             reduce1_device_buf.SetZero();
@@ -349,7 +352,11 @@ bool profile_gemm_reduce_impl(int do_verification,
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 0640e95aba..744db27675 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -19,7 +19,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -45,7 +44,8 @@ bool profile_gemm_splitk_impl(int do_verification,
                               int StrideC,
                               int KBatch,
                               int n_warmup,
-                              int n_iter)
+                              int n_iter,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -55,17 +55,14 @@ bool profile_gemm_splitk_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
@@ -145,6 +142,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
+    int num_kernel        = 0;
 
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
@@ -179,7 +177,12 @@ bool profile_gemm_splitk_impl(int do_verification,
 
             if(op_ptr->IsSupportedArgument(argument_ptr.get()))
             {
-
+                ++num_kernel;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
                 // re-init C to zero before profiling next kernel
                 c_device_buf.SetZero();
 
@@ -298,7 +301,11 @@ bool profile_gemm_splitk_impl(int do_verification,
               << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
               << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
               << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_splitk_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_gemm_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_streamk_impl.hpp
index d24ee1c7ea..f86e7ad447 100644
--- a/profiler/include/profiler/profile_gemm_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_streamk_impl.hpp
@@ -19,7 +19,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -52,17 +51,14 @@ bool profile_gemm_streamk_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
index f4300af8d8..99e24cd205 100644
--- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
@@ -65,11 +65,13 @@ bool profile_gemm_universal_batched_impl(int do_verification,
 
         if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
         }
         else
         {
-            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
         }
     };
 
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index feb75c9660..bb73c4e3da 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -19,7 +19,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -56,17 +55,14 @@ bool profile_gemm_universal_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
index 271bc6ef59..e537cf2770 100644
--- a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
@@ -19,7 +19,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -84,17 +83,14 @@ bool profile_gemm_universal_preshuffle_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
index a0ee6a6674..554956ee88 100644
--- a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
@@ -10,6 +10,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp"
@@ -19,7 +20,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -57,17 +57,14 @@ bool profile_gemm_universal_reduce_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
@@ -86,10 +83,21 @@ bool profile_gemm_universal_reduce_impl(int do_verification,
 
     switch(init_method)
     {
-    case 0: break;
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
     case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
         break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
index 5c859b830d..035a1b77df 100644
--- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
@@ -21,7 +21,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 
@@ -60,17 +59,14 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
-    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-        M, N, K, StrideA, StrideB, StrideC);
-
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index 0aeefaabfb..a7c6717f58 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -35,7 +35,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                                         bool do_log,
                                         bool time_kernel,
                                         const ck::utils::conv::ConvParam& conv_param,
-                                        ck::index_t split_k = 1)
+                                        ck::index_t split_k    = 1,
+                                        index_t instance_index = -1)
 {
     using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -123,9 +124,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     ck::index_t best_split_k = 1;
 
     // profile device op instances
-    bool pass = true;
-
-    auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
+    bool pass          = true;
+    index_t num_kernel = 0;
+    auto run_impl      = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
         // workspace_sz will be equal to 0 for other layout than NGCHW
         const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
         DeviceMem workspace_dev(workspace_sz);
@@ -133,6 +134,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                return;
+            }
             std::string op_name = op_ptr->GetTypeString();
 
             auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -165,8 +172,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                 in_device_buf.FromDevice(in_device.mData.data());
 
                 using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
-                                                       OutDataType,
-                                                       WeiDataType>;
+                                                            OutDataType,
+                                                            WeiDataType>;
                 using AccDataType =
                     std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
                 const index_t num_accums = conv_param.K_;
@@ -185,11 +192,17 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                 // Use higher threshold
                 rtol = std::max(rtol, rtol_split_k);
                 atol = std::max(atol, atol_split_k);
-
-                pass &= ck::utils::check_err(
-                    in_device, in_host, "Error: Incorrect results!", rtol, atol);
-                std::cout << "Relative error threshold: " << rtol
-                          << " Absolute error threshold: " << atol << std::endl;
+                if(split_k_for_run > 1)
+                {
+                    pass &= ck::utils::check_err(
+                        in_device, in_host, "Error: Incorrect results!", rtol, atol);
+                    std::cout << "Relative error threshold: " << rtol
+                              << " Absolute error threshold: " << atol << std::endl;
+                }
+                else
+                {
+                    pass &= ck::utils::check_err(in_device, in_host, "Error: Incorrect results!");
+                }
 
                 if(do_log)
                 {
@@ -291,6 +304,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 479fed78e7..6654275fd0 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -41,7 +41,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                                           bool do_log,
                                           bool time_kernel,
                                           const ck::utils::conv::ConvParam& conv_param,
-                                          const std::string& split_k)
+                                          const std::string& split_k,
+                                          index_t instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -187,6 +188,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         }
     }
 
+    index_t num_kernel = 0;
     for(auto& op_ptr : op_ptrs)
     {
         for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -226,6 +228,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
             if(op_ptr->IsSupportedArgument(argument_ptr.get()))
             {
+                num_kernel++;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
 
                 std::string op_name = op_ptr->GetTypeString();
 
@@ -326,6 +334,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return all_pass;
 }
 
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
index cd6c141219..2f7f3ae4d8 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
@@ -32,6 +32,7 @@ using OutElementOp = ck::tensor_operation::element_wise::BiasNormalizeInInferCla
 using Clamp        = ck::tensor_operation::element_wise::Clamp;
 using Add          = ck::tensor_operation::element_wise::Add;
 
+using BaseConv = ck::tensor_layout::convolution::BaseConvolutionLayout;
 // NOTE: Usage of NHWGK layout for GK bias is a workaround. This test is to
 // just keep such implementation valid.
 // TODO: Add possiblity to pass GK layout and GK lengths for bias and reuse
@@ -42,15 +43,15 @@ auto get_elementwise_desc(ck::index_t G, ck::index_t K)
 {
     if constexpr(NDimSpatial == 1)
     {
-        return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0});
+        return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0}, BaseConv{});
     }
     else if constexpr(NDimSpatial == 2)
     {
-        return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0});
+        return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0}, BaseConv{});
     }
     else
     {
-        return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0});
+        return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0}, BaseConv{});
     }
 }
 
@@ -125,7 +126,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     const float floor   = 0.f;
     const float ceil    = 2048.f;
@@ -294,6 +296,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     bool pass = true;
@@ -306,6 +309,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+                return;
+            }
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
 
@@ -419,7 +429,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_bnorm_clamp_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
index d0e1cf2611..2dbadd8eb1 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
@@ -25,6 +25,8 @@
 namespace ck {
 namespace profiler {
 
+using BaseConv = ck::tensor_layout::convolution::BaseConvolutionLayout;
+
 // NOTE: Usage of NHWGK layout for GK bias is a workaround. This test is to
 // just keep such implementation valid.
 // TODO: Add possiblity to pass GK layout and GK lengths for bias and reuse
@@ -35,15 +37,15 @@ auto get_bias_desc(ck::index_t G, ck::index_t K)
 {
     if constexpr(NDimSpatial == 1)
     {
-        return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0});
+        return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0}, BaseConv{});
     }
     else if constexpr(NDimSpatial == 2)
     {
-        return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0});
+        return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0}, BaseConv{});
     }
     else
     {
-        return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0});
+        return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0}, BaseConv{});
     }
 }
 
@@ -62,7 +64,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -192,7 +195,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -204,6 +207,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+                return;
+            }
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
 
@@ -315,7 +325,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_clamp_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
index 2dcee4c1fc..d490cf4167 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -42,7 +42,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
                                    bool do_log,
                                    bool time_kernel,
                                    const ck::utils::conv::ConvParam& conv_param,
-                                   const OutElementOp out_element_op = OutElementOp{})
+                                   const OutElementOp out_element_op = OutElementOp{},
+                                   index_t instance_index            = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -144,7 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    index_t num_kernel    = 0;
     // profile device op instances
     bool pass = true;
 
@@ -156,6 +157,13 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                return;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -253,7 +261,11 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
               << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
               << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index fc2ba5a650..8314b9053f 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -44,7 +44,8 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& StrideCs,
                                const std::vector<int>& kbatches = {},
                                int n_warmup                     = 1,
-                               int n_iter                       = 10)
+                               int n_iter                       = 10,
+                               int instance_index               = -1)
 {
     bool pass = true;
     // TODO: Fixme - we do not pass compute data type here but need it
@@ -57,11 +58,11 @@ bool profile_grouped_gemm_impl(int do_verification,
 
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
             }
         };
 
@@ -195,8 +196,8 @@ bool profile_grouped_gemm_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
-
-    auto p_ds = std::vector<std::array<const void*, 0>>{};
+    int num_kernel        = 0;
+    auto p_ds             = std::vector<std::array<const void*, 0>>{};
 
     if(do_verification)
     {
@@ -279,6 +280,13 @@ bool profile_grouped_gemm_impl(int do_verification,
 
             if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
             {
+                ++num_kernel;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
+
                 for(std::size_t i = 0; i < gemm_descs.size(); i++)
                     c_device_buf[i]->SetZero();
 
@@ -371,7 +379,11 @@ bool profile_grouped_gemm_impl(int do_verification,
                   << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
                   << std::endl;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
 
diff --git a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
index 55ea08e0db..c1647815ad 100644
--- a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
@@ -26,7 +26,8 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -162,6 +163,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -242,7 +248,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
index d0a5032bff..60982d18d5 100644
--- a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
@@ -29,7 +29,8 @@ bool profile_groupnorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -178,6 +179,11 @@ bool profile_groupnorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -267,6 +273,12 @@ bool profile_groupnorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
index e88a06122d..7704085048 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -167,6 +168,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -247,7 +253,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
index 10fa9c86d5..e36b20e1b5 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
                                            int init_method,
                                            bool do_log,
                                            bool time_kernel,
-                                           std::vector<index_t> length)
+                                           std::vector<index_t> length,
+                                           index_t instance_index = -1)
 {
     // we don't need GammaDataType and DXDataType here, just for reference class
     using GammaDataType = DYDataType;
@@ -178,6 +179,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -255,7 +261,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_gamma_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
index 66272b6eff..51dcbb1275 100644
--- a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
@@ -28,7 +28,8 @@ bool profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -188,6 +189,11 @@ bool profile_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -286,6 +292,12 @@ bool profile_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
index 7a712f21f2..a8efee3ef0 100644
--- a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -82,7 +83,9 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
             using namespace ck::literals;
 
-            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+            return HostTensorDescriptor({N_, C_, H, W},
+                                        {C_ * H * W, 1_uz, W * C_, C_},
+                                        ck::tensor_layout::convolution::NCHW{});
         };
 
     Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
@@ -197,6 +200,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -287,7 +295,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
 
diff --git a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
index 15fb4e9034..cf6050969f 100644
--- a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -84,7 +85,8 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
             using namespace ck::literals;
 
             return HostTensorDescriptor({N_, C_, D, H, W},
-                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_},
+                                        ck::tensor_layout::convolution::NDHWC{});
         };
 
     Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi));
@@ -192,6 +194,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -280,7 +287,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_permute_scale_impl.hpp b/profiler/include/profiler/profile_permute_scale_impl.hpp
index 186a24501e..9ccbd67783 100644
--- a/profiler/include/profiler/profile_permute_scale_impl.hpp
+++ b/profiler/include/profiler/profile_permute_scale_impl.hpp
@@ -40,10 +40,13 @@ bool profile_permute_scale_impl(int do_verification,
     using ElementOp = ck::tensor_operation::element_wise::Scale;
     float scale     = 2.f;
 
-    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(lengths_vector, input_strides_vector)};
-    Tensor<ADataType>& a                = as[0];
-    Tensor<BDataType> b(lengths_vector, output_strides_vector);
-    Tensor<BDataType> host_b(lengths_vector, output_strides_vector);
+    using ALayout                       = ck::tensor_layout::BypassLayoutVerification;
+    using BLayout                       = ck::tensor_layout::BypassLayoutVerification;
+    std::array<Tensor<ADataType>, 1> as = {
+        Tensor<ADataType>(lengths_vector, input_strides_vector, ALayout{})};
+    Tensor<ADataType>& a = as[0];
+    Tensor<BDataType> b(lengths_vector, output_strides_vector, BLayout{});
+    Tensor<BDataType> host_b(lengths_vector, output_strides_vector, BLayout{});
 
     std::cout << "A: " << a.mDesc << std::endl;
     std::cout << "B: " << b.mDesc << std::endl;
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
index 23226a4881..962be4448c 100644
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -35,7 +35,8 @@ bool profile_pool2d_fwd_impl(int do_verification,
                              std::vector<index_t> window_strides,
                              std::vector<index_t> window_dilations,
                              std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
+                             std::vector<index_t> input_right_pads,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -74,7 +75,9 @@ bool profile_pool2d_fwd_impl(int do_verification,
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
             using namespace ck::literals;
 
-            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+            return HostTensorDescriptor({N_, C_, H, W},
+                                        {C_ * H * W, 1_uz, W * C_, C_},
+                                        ck::tensor_layout::convolution::NCHW{});
         };
 
     Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
@@ -169,6 +172,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -266,7 +274,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index cbdacad53b..e1d0c1573d 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -46,7 +46,9 @@ template <typename InDataType,
           ck::ReduceTensorOp ReduceOpId,
           bool PropagateNan,
           bool OutputIndex>
-bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
+bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params,
+                             PoolFwdKernelParams& kernel_params,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -91,7 +93,8 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
             using namespace ck::literals;
 
             return HostTensorDescriptor({N_, C_, D, H, W},
-                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_},
+                                        ck::tensor_layout::convolution::NDHWC{});
         };
 
     Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi));
@@ -198,6 +201,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -327,7 +335,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool3d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
 
diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
index b54aa65aef..14a93af69d 100644
--- a/profiler/include/profiler/profile_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_reduce_impl.hpp
@@ -144,7 +144,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                               const std::vector<size_t>& inLengths,
                               const std::array<int, NumReduceDim>& reduceDims,
                               float alpha,
-                              float beta)
+                              float beta,
+                              index_t instance_index = -1)
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::instance;
@@ -373,7 +374,14 @@ bool profile_reduce_impl_impl(bool do_verification,
             if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                 continue;
             else
+            {
                 num_kernel++;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
+            }
 
             std::string reduce_name = reduce_ptr->GetTypeString();
 
@@ -452,7 +460,11 @@ bool profile_reduce_impl_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     };
-
+    if(instance_index != -1)
+    {
+        std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 };
 
@@ -467,7 +479,8 @@ bool profile_reduce_impl(bool do_verification,
                          bool PropagateNan,
                          bool UseIndex,
                          float alpha,
-                         float beta)
+                         float beta,
+                         index_t instance_index = -1)
 {
     bool matched = false;
     bool pass    = true;
@@ -505,7 +518,8 @@ bool profile_reduce_impl(bool do_verification,
                                                                      inLengths,
                                                                      arrReduceDims,
                                                                      alpha,
-                                                                     beta);
+                                                                     beta,
+                                                                     instance_index);
 
         matched = true;
     });
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 83913d8398..d7a790803a 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -53,7 +53,8 @@ bool profile_softmax_impl(int do_verification,
                           std::vector<index_t> in_strides,
                           std::vector<index_t> reduce_dims,
                           double alpha,
-                          double beta)
+                          double beta,
+                          index_t instance_index = -1)
 {
     if(Rank != in_length.size())
     {
@@ -124,7 +125,7 @@ bool profile_softmax_impl(int do_verification,
     float best_avg_time   = std::numeric_limits<float>::max();
     float best_gb_per_sec = 0;
     std::vector<bool> instance_pass;
-
+    index_t num_kernel = 0;
     for(auto& inst_ptr : instances)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
@@ -146,6 +147,15 @@ bool profile_softmax_impl(int do_verification,
             instance_pass.push_back(true);
             continue;
         }
+        else
+        {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+        }
 
         out_dev.ToDevice(prior_out.data());
         auto invoker_ptr = inst_ptr->MakeInvokerPointer();
@@ -216,6 +226,11 @@ bool profile_softmax_impl(int do_verification,
         std::cout << "alpha = " << alpha << ", " << "beta = " << beta << ", " << best_avg_time
                   << " ms, " << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
     }
+    if(instance_index != -1)
+    {
+        std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return std::all_of(
         std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
 }
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 4700a34e9d..c31ede2c73 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -32,6 +32,7 @@ set(PROFILER_OPS
     profile_conv_tensor_rearrange.cpp
     profile_transpose.cpp
     profile_permute_scale.cpp
+    profile_gemm_quantization.cpp
 )
 
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
@@ -41,25 +42,18 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   endif()
   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
-    list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
     list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
     list(APPEND PROFILER_OPS profile_gemm_add.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm.cpp)
     list(APPEND PROFILER_OPS profile_gemm_streamk.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
     list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
     list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
   endif()
-  list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
   if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]")
-    list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
     list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
     list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
     list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
@@ -70,11 +64,11 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   endif()
   list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp)
   list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
+  list(APPEND PROFILER_OPS profile_gemm_add.cpp)
   list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
   list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
   list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
   list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
   list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
   list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
   list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp)
@@ -85,19 +79,34 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
 endif()
 
 if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
-   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" AND (DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)))
+   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
   list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
 endif()
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
+  list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
+endif()
 
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
   list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
   list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
   list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
+  list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
+  list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
+    list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
+    list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
+    list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
+    list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
+    list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
+  endif()
+  list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
 endif()
 
 if(DL_KERNELS)
@@ -105,6 +114,10 @@ if(DL_KERNELS)
   list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
 endif()
 
+if(CK_ENABLE_INT8)
+  list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
+endif()
+
 set(PROFILER_SOURCES profiler.cpp)
 foreach(SOURCE ${PROFILER_OPS})
   string(REGEX REPLACE "profile_(.+)\.cpp" "\\1" OP_NAME ${SOURCE})
@@ -146,35 +159,33 @@ list(APPEND DEVICE_INSTANCES device_column_to_image_instance)
 list(APPEND DEVICE_INSTANCES device_transpose_instance)
 list(APPEND DEVICE_INSTANCES device_permute_scale_instance)
 
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
   if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
     list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance)
     list(APPEND DEVICE_INSTANCES device_contraction_scale_instance)
   endif()
   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
+    list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
     list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance)
     list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance)
     list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance)
     list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance)
     list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance)
   endif()
   list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]")
-    list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
+  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
+  endif()
+    if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
     list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance)
     list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
   endif()
   if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
     list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
@@ -182,9 +193,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
   list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
+  list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance)
   list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance)
@@ -206,19 +217,35 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
 endif()
 
 if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
-   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" AND (DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)))
+   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" ))
   list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
 endif()
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
+  list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
+endif()
 
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
   list(APPEND DEVICE_INSTANCES device_gemm_universal_instance)
   list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
   list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
+  list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
   list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
   list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
   list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
   list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance)
   list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
+  list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
+  list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
+    list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
+  endif()
+  list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
 endif()
 
 if(DL_KERNELS)
@@ -228,6 +255,10 @@ if(DL_KERNELS)
   list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
 endif()
 
+if(CK_ENABLE_INT8)
+  list(APPEND DEVICE_INSTANCES device_quantization_instance)
+endif()
+
 set(PROFILER_LIBS utility getopt::getopt)
 foreach(LIB ${DEVICE_INSTANCES})
   string(REGEX REPLACE "device_(.+)_instance" "\\1" INSTANCE_NAME ${LIB})
diff --git a/profiler/src/profile_gemm_add_multiply.cpp b/profiler/src/profile_gemm_add_multiply.cpp
index 560467c264..f8ec7abb66 100644
--- a/profiler/src/profile_gemm_add_multiply.cpp
+++ b/profiler/src/profile_gemm_add_multiply.cpp
@@ -35,10 +35,10 @@ int profile_gemm_add_multiply(int argc, char* argv[])
         // clang-format off
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
-        printf("arg3: matrix layout (0: E[m, n] = AddMultiply((A[m, k] * B[k, n] + D0[m, n]) x D1[m, n]);\n");
-        printf("                     1: E[m, n] = AddMultiply((A[m, k] * B[k, n] + D0[m, n]) x D1[m, n]);\n");
-        printf("                     2: E[m, n] = AddMultiply((A[m, k] * B[k, n] + D0[m, n]) x D1[m, n]);\n");
-        printf("                     3: E[m, n] = AddMultiply((A[m, k] * B[k, n] + D0[m, n]) x D1[m, n]))\n");
+        printf("arg3: matrix layout (0: E[m, n] = (A[m, k] * B[k, n] + D0[m, n]) x D1[m, n];\n");
+        printf("                     1: E[m, n] = (A[m, k] * B[n, k] + D0[m, n]) x D1[m, n];\n");
+        printf("                     2: E[m, n] = (A[k, m] * B[k, n] + D0[m, n]) x D1[m, n];\n");
+        printf("                     3: E[m, n] = (A[k, m] * B[n, k] + D0[m, n]) x D1[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg6: print tensor value (0: no; 1: yes)\n");
diff --git a/profiler/src/profile_gemm_add_silu.cpp b/profiler/src/profile_gemm_add_silu.cpp
index daaaef0fa2..a35c0a4092 100644
--- a/profiler/src/profile_gemm_add_silu.cpp
+++ b/profiler/src/profile_gemm_add_silu.cpp
@@ -27,15 +27,17 @@ int profile_gemm_add_silu(int argc, char* argv[])
 
     enum struct MatrixDataType
     {
-        F16_INT8_F16_F16,    // 0
-        BF16_INT8_BF16_BF16, // 1
+        F16_INT8_F16_F16    = 0,
+        BF16_INT8_BF16_BF16 = 1,
+        F16_F16_F16_F16     = 2,
+        BF16_BF16_BF16_BF16 = 3
     };
 
     if(argc != 15)
     {
         // clang-format off
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: f16&i8 1: bf16&i8)\n");
+        printf("arg2: data type (0: f16&i8 1: bf16&i8 2: f16&f16 3: bf16&bf16)\n");
         printf("arg3: matrix layout (0: E[m, n] = ReLU(A[m, k] * B[k, n] + D0[m, n]);\n");
         printf("                     1: E[m, n] = ReLU(A[m, k] * B[n, k] + D0[m, n]);\n");
         printf("                     2: E[m, n] = ReLU(A[k, m] * B[k, n] + D0[m, n]);\n");
@@ -128,6 +130,14 @@ int profile_gemm_add_silu(int argc, char* argv[])
     {
         return profile(BF16{}, INT8{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
     }
+    else if(data_type == MatrixDataType::BF16_BF16_BF16_BF16 && layout == MatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
+    }
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/profiler/src/profile_gemm_multi_abd.cpp b/profiler/src/profile_gemm_multi_abd.cpp
new file mode 100644
index 0000000000..157bcbc977
--- /dev/null
+++ b/profiler/src/profile_gemm_multi_abd.cpp
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_multi_abd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+enum struct GemmDataType
+{
+    BF16_I8_BF16_BF16, // 0
+};
+
+enum struct GemmElementOp
+{
+    PASS_THROUGH,          // 0
+    MULTIPLY,              // 1
+    ADD,                   // 2
+    FASTGELU,              // 3
+    ADD_FASTGELU,          // 4
+    MULTIPLY_ADD,          // 5
+    MULTIPLY_FASTGELU,     // 6
+    MULTIPLY_ADD_FASTGELU, // 7
+};
+
+#define OP_NAME "gemm_multi_abd"
+#define OP_DESC "GEMM_Multiple_ABD"
+
+int profile_gemm_multi_abd(int argc, char* argv[])
+{
+    if(argc != 18)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: bf16@int8/bf16->bf16;)\n");
+        printf("arg3: matrix layout (0: E[m, n] = A[m, k] * B[k, n];\n");
+        printf("                     1: E[m, n] = A[m, k] * B[n, k];\n");
+        printf("                     2: E[m, n] = A[k, m] * B[k, n];\n");
+        printf("                     3: E[m, n] = A[k, m] * B[n, k])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8: number of As (1)\n");
+        printf("arg9: number of Bs (1/2)\n");
+        printf("arg10: number of Ds (0/1/2)\n");
+        printf("arg11 to 17: M, N, K, StrideA, StrideB, StrideE, StrideD\n");
+        // clang-format on
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int num_as = std::stoi(argv[8]);
+    const int num_bs = std::stoi(argv[9]);
+    const int num_ds = std::stoi(argv[10]);
+
+    const int M = std::stoi(argv[11]);
+    const int N = std::stoi(argv[12]);
+    const int K = std::stoi(argv[13]);
+
+    const int StrideA = std::stoi(argv[14]);
+    const int StrideB = std::stoi(argv[15]);
+    const int StrideE = std::stoi(argv[16]);
+    const int StrideD = std::stoi(argv[17]);
+
+    using F32  = float;
+    using BF16 = ck::bhalf_t;
+    using I8   = int8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Multiply    = ck::tensor_operation::element_wise::Multiply;
+    using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+    using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+    auto profile = [&](auto b_layout, auto b_element_op, auto cde_element_op, auto num_d_tensor) {
+        using ADataType  = BF16;
+        using B0DataType = I8;
+        using B1DataType = BF16;
+        using DDataType  = BF16;
+        using EDataType  = BF16;
+
+        using ALayout = Row;
+        using BLayout = decltype(b_layout);
+        using DLayout = Row;
+        using ELayout = Row;
+
+        using AElementOp         = PassThrough;
+        using BElementOp         = decltype(b_element_op);
+        using CDEElementOp       = decltype(cde_element_op);
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideD = ck::is_same_v<DLayout, Row> ? N : M;
+        const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
+
+        constexpr auto NumberDTensor = decltype(num_d_tensor){};
+
+        // Only num_d_tensor == 0 and 1 are supported
+        using DsDataType = typename std::
+            conditional<(NumberDTensor == 0), ck::Tuple<>, ck::Tuple<DDataType>>::type;
+        using DsLayout =
+            typename std::conditional<(NumberDTensor == 0), ck::Tuple<>, ck::Tuple<DLayout>>::type;
+
+        bool pass = ck::profiler::profile_gemm_multi_abd_impl<ck::Tuple<ADataType>,
+                                                              ck::Tuple<B0DataType, B1DataType>,
+                                                              F32,
+                                                              DsDataType,
+                                                              EDataType,
+                                                              ck::Tuple<ALayout>,
+                                                              ck::Tuple<BLayout, BLayout>,
+                                                              DsLayout,
+                                                              ELayout,
+                                                              AElementOp,
+                                                              BElementOp,
+                                                              CDEElementOp>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideD < 0) ? DefaultStrideD : StrideD,
+            (StrideE < 0) ? DefaultStrideE : StrideE);
+
+        return pass ? 0 : 1;
+    };
+
+    // num_as == 1 is only supported
+    if(data_type != GemmDataType::BF16_I8_BF16_BF16 || num_as != 1)
+    {
+        std::cout << "The provided input parameters are not supported" << std::endl;
+        return 1;
+    }
+
+    // Supported configurations
+    if(layout == GemmMatrixLayout::MK_KN_MN && num_bs == 2 && num_ds == 1)
+    {
+        return profile(Row{}, Multiply{}, AddFastGelu{}, ck::Number<1>{});
+    }
+    else if(layout == GemmMatrixLayout::MK_KN_MN && num_bs == 2 && num_ds == 0)
+    {
+        return profile(Row{}, Multiply{}, FastGelu{}, ck::Number<0>{});
+    }
+    else if(layout == GemmMatrixLayout::MK_NK_MN && num_bs == 2 && num_ds == 1)
+    {
+        return profile(Col{}, Multiply{}, AddFastGelu{}, ck::Number<1>{});
+    }
+    else if(layout == GemmMatrixLayout::MK_NK_MN && num_bs == 2 && num_ds == 0)
+    {
+        return profile(Col{}, Multiply{}, FastGelu{}, ck::Number<0>{});
+    }
+
+    std::cout << "The provided input parameters are not supported" << std::endl;
+
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multi_abd);
diff --git a/profiler/src/profile_gemm_multiply_add.cpp b/profiler/src/profile_gemm_multiply_add.cpp
index 98973b2f01..88d3b5256a 100644
--- a/profiler/src/profile_gemm_multiply_add.cpp
+++ b/profiler/src/profile_gemm_multiply_add.cpp
@@ -92,12 +92,6 @@ int profile_gemm_multiply_add(int argc, char* argv[])
         using D1Layout = decltype(d1_layout);
         using ELayout  = decltype(e_layout);
 
-        const int DefaultStrideA  = ck::is_same_v<ALayout, Row> ? K : M;
-        const int DefaultStrideB  = ck::is_same_v<BLayout, Row> ? N : K;
-        const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
-        const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
-        const int DefaultStrideE  = ck::is_same_v<ELayout, Row> ? N : M;
-
         bool pass = ck::profiler::profile_gemm_multiply_add_impl<ADataType,
                                                                  BDataType,
                                                                  AccDataType,
@@ -108,19 +102,18 @@ int profile_gemm_multiply_add(int argc, char* argv[])
                                                                  BLayout,
                                                                  D0Layout,
                                                                  D1Layout,
-                                                                 ELayout>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            M,
-            N,
-            K,
-            (StrideA < 0) ? DefaultStrideA : StrideA,
-            (StrideB < 0) ? DefaultStrideB : StrideB,
-            (StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
-            (StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
-            (StrideE < 0) ? DefaultStrideE : StrideE);
+                                                                 ELayout>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          StrideA,
+                                                                          StrideB,
+                                                                          StrideD0,
+                                                                          StrideD1,
+                                                                          StrideE);
 
         return pass ? 0 : 1;
     };
diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp
index 42192b5985..87424c21a2 100644
--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -92,9 +92,14 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
     using F32  = float;
     using BF16 = ck::bhalf_t;
     using F16  = ck::half_t;
-    using F8   = ck::f8_t;
-    using I8   = int8_t;
-    using I32  = int;
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
+    using F8 = ck::f8_t;
+#endif
+#ifdef CK_ENABLE_INT8
+    using I8  = int8_t;
+    using I32 = int;
+#endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -162,33 +167,32 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
 
         return pass ? 0 : 1;
     };
-
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(
             F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
-    else if(data_type == GemmDataType::F8_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    if(data_type == GemmDataType::F8_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(
             F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
-    else if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+#endif // CK_ENABLE_FP8
+    if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(
             I8{}, I8{}, I8{}, I32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
-    else if(data_type == GemmDataType::INT8_INT8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    if(data_type == GemmDataType::INT8_INT8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(
             I8{}, I8{}, I8{}, I32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
-    else
-    {
-        std::cout << "this data_type & layout is not implemented" << std::endl;
 
-        return 1;
-    }
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+    return 1;
 }
 
 REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_multiply);
diff --git a/profiler/src/profile_gemm_quantization.cpp b/profiler/src/profile_gemm_quantization.cpp
new file mode 100644
index 0000000000..d28dd60dce
--- /dev/null
+++ b/profiler/src/profile_gemm_quantization.cpp
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <cstdio>
+
+#include "profiler/profile_gemm_quantization_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_quantization"
+#define OP_DESC "GEMM Quantization"
+
+using INT8  = int8_t;
+using INT32 = int32_t;
+
+int profile_gemm_quantization(int argc, char* argv[])
+{
+    enum struct MatrixLayout
+    {
+        MK_KN_MN, // 0:
+        MK_NK_MN, // 1:
+        KM_KN_MN, // 2:
+        KM_NK_MN, // 3:
+    };
+
+    if(argc != 14)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: matrix layout (0: E[m, n] = A[m, k] * B[k, n];\n");
+        printf("                     1: E[m, n] = A[m, k] * B[n, k];\n");
+        printf("                     2: E[m, n] = A[k, m] * B[k, n];\n");
+        printf("                     3: E[m, n] = A[k, m] * B[n, k])\n");
+        printf("arg3: verification (0: no; 1: yes)\n");
+        printf("arg4: initialization (0: no init; default: integer value)\n");
+        printf("arg5: print tensor value (0: no; 1: yes)\n");
+        printf("arg6: time kernel (0=no, 1=yes)\n");
+        printf("arg7 to 12: M, N, K, StrideA, StrideB, StrideE\n");
+        printf("arg13: requant_scale (float, e.g., 0.03)\n");
+        // clang-format on
+        exit(1);
+    }
+
+    const auto layout          = static_cast<MatrixLayout>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method      = std::stoi(argv[4]);
+    const bool do_log          = std::stoi(argv[5]);
+    const bool time_kernel     = std::stoi(argv[6]);
+
+    const int M = std::stoi(argv[7]);
+    const int N = std::stoi(argv[8]);
+    const int K = std::stoi(argv[9]);
+
+    const int StrideA = std::stoi(argv[10]);
+    const int StrideB = std::stoi(argv[11]);
+    const int StrideE = std::stoi(argv[12]);
+
+    const float requant_scale = std::stof(argv[13]);
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    auto profile = [&](auto a_layout, auto b_layout, auto e_layout) {
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using ELayout = decltype(e_layout);
+
+        bool pass = ck::profiler::profile_gemm_quantization_impl<int8_t,
+                                                                 int8_t,
+                                                                 int32_t,
+                                                                 int8_t,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          StrideA,
+                                                                          StrideB,
+                                                                          StrideE,
+                                                                          requant_scale);
+
+        return pass ? 0 : 1;
+    };
+
+    if(layout == MatrixLayout::MK_KN_MN)
+    {
+        return profile(Row{}, Row{}, Row{});
+    }
+    else if(layout == MatrixLayout::MK_NK_MN)
+    {
+        return profile(Row{}, Col{}, Row{});
+    }
+    else if(layout == MatrixLayout::KM_KN_MN)
+    {
+        return profile(Col{}, Row{}, Row{});
+    }
+    else if(layout == MatrixLayout::KM_NK_MN)
+    {
+        return profile(Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_quantization);
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index 24028b1448..d35cd27651 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -103,7 +103,8 @@ int profile_gemm_universal(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     using F8 = ck::f8_t;
     using I4 = ck::pk_i4_t;
 #endif
@@ -167,7 +168,8 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
@@ -201,7 +203,8 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
diff --git a/profiler/src/profile_gemm_universal_preshuffle.cpp b/profiler/src/profile_gemm_universal_preshuffle.cpp
index bc09d7d35d..d8d8f29ac6 100644
--- a/profiler/src/profile_gemm_universal_preshuffle.cpp
+++ b/profiler/src/profile_gemm_universal_preshuffle.cpp
@@ -104,7 +104,8 @@ int profile_gemm_universal_preshuffle(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     using F8 = ck::f8_t;
 #endif
 
@@ -163,7 +164,8 @@ int profile_gemm_universal_preshuffle(int argc, char* argv[])
     {
         return profile(F8{}, F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || CK_USE_OCP_FP8 || defined(CK_USE_GFX94) || \
+    defined(CK_USE_WMMA_FP8)
     if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp
index a7714b4c73..a8d343405d 100644
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -21,14 +21,15 @@ enum struct ConvLayout
 
 enum struct ConvDataType
 {
-    F32_F32_F32,    // 0
-    F16_F16_F16,    // 1
-    BF16_BF16_BF16, // 2
-    INT8_INT8_INT8, // 3
-    F8_F8_F8,       // 4
-    BF8_BF8_F8,     // 5
-    F8_BF8_F8,      // 6
-    BF8_F8_F8,      // 7
+    F32_F32_F32,      // 0
+    F16_F16_F16,      // 1
+    BF16_BF16_BF16,   // 2
+    INT8_INT8_INT8,   // 3
+    F8_F8_F8,         // 4
+    BF8_BF8_F8,       // 5
+    F8_BF8_F8,        // 6
+    BF8_F8_F8,        // 7
+    F32_F32_F32_TF32, // 8
 };
 
 enum struct IndexType
@@ -52,7 +53,8 @@ static void print_helper_msg()
         << "                 4: Input fp8, Weight fp8, Output fp8\n"
         << "                 5: Input bf8, Weight bf8, Output fp8\n"
         << "                 6: Input fp8, Weight bf8, Output fp8\n"
-        << "                 7: Input bf8, Weight fp8, Output fp8)\n"
+        << "                 7: Input bf8, Weight fp8, Output fp8\n"
+        << "                 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n"
         << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
         << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n"
         << "                     2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, "
@@ -103,6 +105,9 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
     using INT8 = int8_t;
     using F8   = ck::f8_t;
     using BF8  = ck::bf8_t;
+#if defined(__gfx942__)
+    using TF32 = ck::tf32_t;
+#endif
 
     //
     using GNWC   = ck::tensor_layout::convolution::GNWC;
@@ -261,6 +266,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
             return profile(
                 I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{});
         }
+        else if(data_type == ConvDataType::F32_F32_F32_TF32)
+        {
+#if defined(__gfx942__)
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
+#endif
+        }
     }
     // NHWGC_GKYXC_NHWGK
     else if(num_dim_spatial == 1 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
@@ -367,6 +378,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
         {
             return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, BF8{}, F8{});
         }
+        else if(data_type == ConvDataType::F32_F32_F32_TF32)
+        {
+#if defined(__gfx942__)
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
+#endif
+        }
     }
     // NGCDHW_GKCZYX_NGKDHW
     else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
@@ -384,6 +401,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
             return profile(
                 I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
         }
+        else if(data_type == ConvDataType::F32_F32_F32_TF32)
+        {
+#if defined(__gfx942__)
+            return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
+#endif
+        }
     }
 
     std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh
index ea2834ae62..74391ded28 100755
--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
@@ -1,7 +1,2 @@
-#!/bin/bash
-set -euo pipefail
-IFS=$'\n\t'
-
-
-find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
-git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
+find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | grep -v 'build/' | grep -v 'include/rapidjson'| xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc|include/rapidjson/")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 217ec998bd..6220009b03 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -20,7 +20,7 @@ fi
 GPU_TARGETS="gfx908;gfx90a;gfx942"
 
 if [ $# -ge 1 ]; then
-    case "$1" in 
+    case "$1" in
         gfx*)
             GPU_TARGETS=$1
             shift 1
@@ -28,23 +28,17 @@ if [ $# -ge 1 ]; then
             REST_ARGS=("$@")
             ;;
         *)
-            echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
-            GPU_TARGETS="gfx908;gfx90a;gfx942"
-            shift 1
             REST_ARGS=("$@")
             ;;
     esac
 else
-    echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
-    GPU_TARGETS="gfx908;gfx90a;gfx942"
-    shift 1
     REST_ARGS=("$@")
 fi
 
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm/                                                                   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++                                                  \
--D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"                    \
+-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker -fbracket-depth=512" \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py
index 9e2f436e68..d814e0719c 100644
--- a/script/convert_miopen_driver_to_profiler.py
+++ b/script/convert_miopen_driver_to_profiler.py
@@ -10,7 +10,7 @@ import subprocess
 
 
 def init_const_args(args):
-    args.ck_profiler_cmd = '../build/bin/ckProfiler'
+    args.ck_profiler_cmd = "../build/bin/ckProfiler"
     # use decimal values
     args.init_method = 2
     # don't print tensor values
@@ -27,52 +27,62 @@ def run_ck_profiler_cmd(cmd):
 
 
 def parse_layouts(args):
-    if args.in_layout == "NCW" or args.in_layout == "NCHW" or \
-       args.in_layout == "NCDHW":
+    if args.in_layout == "NCW" or args.in_layout == "NCHW" or args.in_layout == "NCDHW":
         if args.ck_profier_op == "grouped_conv_bwd_weight":
             args.layout = 4
-        elif args.ck_profier_op == "grouped_conv_fwd" or \
-             args.ck_profier_op == "grouped_conv_bwd_data":
+        elif (
+            args.ck_profier_op == "grouped_conv_fwd"
+            or args.ck_profier_op == "grouped_conv_bwd_data"
+        ):
             args.layout = 3
         else:
-            print('Not supported layout for this op')
+            print("Not supported layout for this op")
             exit(1)
-    elif args.in_layout == "NWC" or args.in_layout == "NHWC" or \
-       args.in_layout == "NDHWC":
+    elif (
+        args.in_layout == "NWC" or args.in_layout == "NHWC" or args.in_layout == "NDHWC"
+    ):
         if args.ck_profier_op == "grouped_conv_bwd_weight":
             args.layout = 2
-        elif args.ck_profier_op == "grouped_conv_bwd_data" or \
-                args.ck_profier_op == "grouped_conv_fwd":
+        elif (
+            args.ck_profier_op == "grouped_conv_bwd_data"
+            or args.ck_profier_op == "grouped_conv_fwd"
+        ):
             args.layout = 1
     else:
-        print('Not supported layout for this op')
+        print("Not supported layout for this op")
         exit(1)
 
 
 def parse_data_type(args):
     if args.data_type == "fp32":
-        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
-           args.ck_profier_op == "grouped_conv_bwd_data" or \
-           args.ck_profier_op == "grouped_conv_fwd":
+        if (
+            args.ck_profier_op == "grouped_conv_bwd_weight"
+            or args.ck_profier_op == "grouped_conv_bwd_data"
+            or args.ck_profier_op == "grouped_conv_fwd"
+        ):
             args.data_type = 0
     if args.data_type == "fp16":
-        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
-           args.ck_profier_op == "grouped_conv_bwd_data" or \
-           args.ck_profier_op == "grouped_conv_fwd":
+        if (
+            args.ck_profier_op == "grouped_conv_bwd_weight"
+            or args.ck_profier_op == "grouped_conv_bwd_data"
+            or args.ck_profier_op == "grouped_conv_fwd"
+        ):
             args.data_type = 1
     if args.data_type == "int8":
         if args.ck_profier_op == "grouped_conv_bwd_weight":
             args.data_type = 4
         if args.ck_profier_op == "grouped_conv_bwd_data":
-            print('Not supported data type for grouped_conv_bwd_data')
+            print("Not supported data type for grouped_conv_bwd_data")
             exit(1)
         if args.ck_profier_op == "grouped_conv_fwd":
             args.data_type = 3
     if args.data_type == "bfp16":
         if args.ck_profier_op == "grouped_conv_bwd_weight":
             args.data_type = 5
-        if args.ck_profier_op == "grouped_conv_bwd_data" or \
-           args.ck_profier_op == "grouped_conv_fwd":
+        if (
+            args.ck_profier_op == "grouped_conv_bwd_data"
+            or args.ck_profier_op == "grouped_conv_fwd"
+        ):
             args.data_type = 2
 
 
@@ -93,13 +103,11 @@ def add_conv_params_to_cmd(args, cmd):
         cmd += [str(args.in_d), str(args.in_h), str(args.in_w)]
         cmd += [str(args.conv_stride_d), str(args.conv_stride_h)]
         cmd += [str(args.conv_stride_w)]
-        cmd += [str(args.dilation_d),
-                str(args.dilation_h),
-                str(args.dilation_w)]
+        cmd += [str(args.dilation_d), str(args.dilation_h), str(args.dilation_w)]
         cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
         cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
     else:
-        print('Not supported spatial dim (supported: 1, 2, 3)')
+        print("Not supported spatial dim (supported: 1, 2, 3)")
         exit(1)
 
 
@@ -147,7 +155,7 @@ def run_ck_grouped_conv_bwd_weight(args):
     parse_data_type(args)
     parse_layouts(args)
     # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
-    args.split_k_value = -1
+    args.split_k_value = "all"
 
     cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)]
     cmd += [str(args.data_type), str(args.layout)]
@@ -161,23 +169,23 @@ def run_ck_grouped_conv_bwd_weight(args):
     cmd += [str(args.split_k_value)]
     run_ck_profiler_cmd(cmd)
 
+
 # Get name of miopen driver, remove it from unknown
 def process_miopen_driver_name(args, unknown):
     if "convint8" in unknown:
-        args.data_type = 'int8'
+        args.data_type = "int8"
         unknown.remove("convint8")
     elif "convbfp16" in unknown:
-        args.data_type = 'bfp16'
+        args.data_type = "bfp16"
         unknown.remove("convbfp16")
     elif "convfp16" in unknown:
-        args.data_type = 'fp16'
+        args.data_type = "fp16"
         unknown.remove("convfp16")
     elif "conv" in unknown:
-        args.data_type = 'fp32'
+        args.data_type = "fp32"
         unknown.remove("conv")
     else:
-        print('Not supported driver (supported: conv, convfp16, convint8,'
-              ' convbfp16).')
+        print("Not supported driver (supported: conv, convfp16, convint8, convbfp16).")
         exit(1)
 
 
@@ -199,11 +207,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="converter",
         description="Convert miopen driver command to ck Profiler"
-                    "\nExample: python3 "
-                    "../script/convert_miopen_driver_to_profiler.py "
-                    "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
-                    "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
-                    "32 -F 1 -t 1",
+        "\nExample: python3 "
+        "../script/convert_miopen_driver_to_profiler.py "
+        "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
+        "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
+        "32 -F 1 -t 1",
     )
     parser.add_argument(
         "-in_layout",
@@ -213,7 +221,7 @@ if __name__ == "__main__":
         default="NCHW",
         type=str,
         required=False,
-        help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"
+        help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)",
     )
     parser.add_argument(
         "-forw",
@@ -230,7 +238,7 @@ if __name__ == "__main__":
         "\n4 wrw only"
         "\n3 fwd+bwd"
         "\n5 fwd+wrw"
-        "\n6 bwd+wrw"
+        "\n6 bwd+wrw",
     )
     parser.add_argument(
         "-spatial_dim",
@@ -240,7 +248,7 @@ if __name__ == "__main__":
         default=2,
         type=int,
         required=False,
-        help="convolution spatial dimension (Default-2)"
+        help="convolution spatial dimension (Default-2)",
     )
     parser.add_argument(
         "-batchsize",
@@ -250,7 +258,7 @@ if __name__ == "__main__":
         default=100,
         type=int,
         required=False,
-        help="Mini-batch size (Default=100)"
+        help="Mini-batch size (Default=100)",
     )
     parser.add_argument(
         "-in_channels",
@@ -260,7 +268,7 @@ if __name__ == "__main__":
         default=3,
         type=int,
         required=False,
-        help="Number of Input Channels (Default=3)"
+        help="Number of Input Channels (Default=3)",
     )
     parser.add_argument(
         "-in_d",
@@ -270,7 +278,7 @@ if __name__ == "__main__":
         default=32,
         type=int,
         required=False,
-        help="Input Depth (Default=32)"
+        help="Input Depth (Default=32)",
     )
     parser.add_argument(
         "-in_h",
@@ -280,7 +288,7 @@ if __name__ == "__main__":
         default=32,
         type=int,
         required=False,
-        help="Input Height (Default=32)"
+        help="Input Height (Default=32)",
     )
     parser.add_argument(
         "-in_w",
@@ -290,7 +298,7 @@ if __name__ == "__main__":
         default=32,
         type=int,
         required=False,
-        help="Input Width (Default=32)"
+        help="Input Width (Default=32)",
     )
     parser.add_argument(
         "-out_channels",
@@ -300,7 +308,7 @@ if __name__ == "__main__":
         default=32,
         type=int,
         required=False,
-        help="Number of Output Channels (Default=32)"
+        help="Number of Output Channels (Default=32)",
     )
     parser.add_argument(
         "-fil_d",
@@ -310,7 +318,7 @@ if __name__ == "__main__":
         default=3,
         type=int,
         required=False,
-        help="Filter Depth (Default=3)"
+        help="Filter Depth (Default=3)",
     )
     parser.add_argument(
         "-fil_h",
@@ -320,7 +328,7 @@ if __name__ == "__main__":
         default=3,
         type=int,
         required=False,
-        help="Filter Height (Default=3)"
+        help="Filter Height (Default=3)",
     )
     parser.add_argument(
         "-fil_w",
@@ -330,7 +338,7 @@ if __name__ == "__main__":
         default=3,
         type=int,
         required=False,
-        help="Filter Width (Default=3)"
+        help="Filter Width (Default=3)",
     )
     parser.add_argument(
         "-conv_stride_d",
@@ -340,7 +348,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Convolution Stride for Depth (Default=1)"
+        help="Convolution Stride for Depth (Default=1)",
     )
     parser.add_argument(
         "-conv_stride_h",
@@ -350,7 +358,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Convolution Stride for Height (Default=1)"
+        help="Convolution Stride for Height (Default=1)",
     )
     parser.add_argument(
         "-conv_stride_w",
@@ -360,7 +368,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Convolution Stride for Width (Default=1)"
+        help="Convolution Stride for Width (Default=1)",
     )
     parser.add_argument(
         "-pad_d",
@@ -370,7 +378,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Zero Padding for Depth (Default=0)"
+        help="Zero Padding for Depth (Default=0)",
     )
     parser.add_argument(
         "-pad_h",
@@ -380,7 +388,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Zero Padding for Height (Default=0)"
+        help="Zero Padding for Height (Default=0)",
     )
     parser.add_argument(
         "-pad_w",
@@ -390,7 +398,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Zero Padding for Width (Default=0)"
+        help="Zero Padding for Width (Default=0)",
     )
     parser.add_argument(
         "-verify",
@@ -400,7 +408,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Verify Each Layer (Default=1)"
+        help="Verify Each Layer (Default=1)",
     )
     parser.add_argument(
         "-time",
@@ -410,7 +418,7 @@ if __name__ == "__main__":
         default=0,
         type=int,
         required=False,
-        help="Time Each Layer (Default=0)"
+        help="Time Each Layer (Default=0)",
     )
     parser.add_argument(
         "-dilation_d",
@@ -420,7 +428,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Dilation of Filter Depth (Default=1)"
+        help="Dilation of Filter Depth (Default=1)",
     )
     parser.add_argument(
         "-dilation_h",
@@ -430,7 +438,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Dilation of Filter Height (Default=1)"
+        help="Dilation of Filter Height (Default=1)",
     )
     parser.add_argument(
         "-dilation_w",
@@ -440,7 +448,7 @@ if __name__ == "__main__":
         default=1,
         type=int,
         required=False,
-        help="Dilation of Filter Width (Default=1)"
+        help="Dilation of Filter Width (Default=1)",
     )
     parser.add_argument(
         "-group_count",
@@ -450,7 +458,7 @@ if __name__ == "__main__":
         type=int,
         default=1,
         required=False,
-        help="Number of Groups (Default=1)"
+        help="Number of Groups (Default=1)",
     )
 
     args, unknown = parser.parse_known_args()
diff --git a/script/dependency-parser/main.py b/script/dependency-parser/main.py
index b8fd67ac49..5c956bca00 100644
--- a/script/dependency-parser/main.py
+++ b/script/dependency-parser/main.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Unified CLI for Ninja Dependency Analysis and Selective Testing
 
diff --git a/script/dependency-parser/src/enhanced_ninja_parser.py b/script/dependency-parser/src/enhanced_ninja_parser.py
index 087ab50640..725768a61f 100644
--- a/script/dependency-parser/src/enhanced_ninja_parser.py
+++ b/script/dependency-parser/src/enhanced_ninja_parser.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Enhanced Ninja Dependency Parser
 
diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py
index f364d60d27..e8698d115d 100644
--- a/script/dependency-parser/src/selective_test_filter.py
+++ b/script/dependency-parser/src/selective_test_filter.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Selective Test Filter Tool
 
diff --git a/script/gemm_profile.sh b/script/gemm_profile.sh
index b71c43f74f..89419ca711 100755
--- a/script/gemm_profile.sh
+++ b/script/gemm_profile.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 BIN=./bin/tile_example_gemm_weight_preshuffle
 PREC=fp8
@@ -36,8 +38,13 @@ ARGS_LIST=(
   "14 5120 1024"
   "15 2048 5120"
   "15 5120 1024"
+  "16 64 128"
+  "16 64 256"
   "16 2048 5120"
   "16 5120 1024"
+  "512 768 640"
+  "1024 1792 896"
+  "1536 2816 1152"
   "2048 5120 1024"
   "2048 5120 8192"
   "2048 7168 8192"
@@ -68,8 +75,8 @@ for args in "${ARGS_LIST[@]}"; do
   PERF_LINE=$(echo "$OUTPUT" | grep "TFlops")
 
   # Extract verification result
-  # Format: "The GPU verification result is: correct"
-  VERIFICATION=$(echo "$OUTPUT" | grep "The GPU verification result is:" | sed -n 's/.*The GPU verification result is: \(.*\)/\1/p')
+  # Format: "The GPU verification result is:correct" (note: no space after colon)
+  VERIFICATION=$(echo "$OUTPUT" | grep "The GPU verification result is:" | sed -n 's/.*The GPU verification result is:\(.*\)/\1/p')
 
   if [ -n "$PERF_LINE" ]; then
     # Extract execution time in ms
@@ -89,6 +96,7 @@ for args in "${ARGS_LIST[@]}"; do
     echo "  Time: ${TIME_MS} ms"
     echo "  TFlops: ${TFLOPS}"
     echo "  GB/s: ${GBPS}"
+    echo "  Verification: ${VERIFICATION:-N/A}"
 
     
     # Save to CSV file
diff --git a/script/launch_tests.sh b/script/launch_tests.sh
index 829ac82378..17a99e62a3 100755
--- a/script/launch_tests.sh
+++ b/script/launch_tests.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 # Get the directory where the script is located
 BUILD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -47,7 +49,7 @@ with open('$TEST_FILE', 'r') as f:
     if tests:
         # Extract just the filename after the last '/'
         clean_tests = [os.path.basename(test) for test in tests]
-        print('ctest -R \"' + '|'.join(clean_tests) + '\"')
+        print('ctest --output-on-failure -R \"' + '|'.join(clean_tests) + '\"')
     else:
         print('# No tests to run')
 ")
@@ -55,5 +57,3 @@ with open('$TEST_FILE', 'r') as f:
 echo "$command"
 
 eval "$command"
-
-
diff --git a/script/ninja_json_converter.py b/script/ninja_json_converter.py
index 92660dc7b3..7bfb2f867b 100644
--- a/script/ninja_json_converter.py
+++ b/script/ninja_json_converter.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 """
 Converts .ninja_log files into Chrome's about:tracing format.
diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh
index 5c7a78d0cc..b017d2e1d6 100755
--- a/script/remod_for_ck_tile.sh
+++ b/script/remod_for_ck_tile.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 # Get list of staged files
 STAGED_FILES=$(git diff --cached --name-only)
diff --git a/script/remove_exec_bit.sh b/script/remove_exec_bit.sh
index 25466d8c37..5b0035c8b8 100755
--- a/script/remove_exec_bit.sh
+++ b/script/remove_exec_bit.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 for file in $(git diff --cached --name-only --diff-filter=ACM | grep -E '\.(cpp|hpp|txt|inc)$'); do
     if [ -x "$file" ]; then
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a2196ad2b2..292bc41a0b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -38,6 +38,13 @@ set(REGRESSION_TESTS
     test_conv_tensor_rearrange
     test_gemm_mx
     test_ck_tile_batched_transpose
+    test_ck_tile_fmha_bwd_fp32
+    test_ck_tile_fmha_bwd_bf16
+    test_ck_tile_fmha_bwd_fp16
+    test_ck_tile_fmha_fwd_fp32
+    test_ck_tile_fmha_fwd_bf16
+    test_ck_tile_fmha_fwd_fp16
+    test_ck_tile_fmha_fwd_fp8
 )
 
 function(add_test_executable TEST_NAME)
@@ -89,7 +96,7 @@ function(add_test_executable TEST_NAME)
         endif()
     endforeach()
     foreach(source IN LISTS ARGN)
-        if(NOT TEST_TARGETS MATCHES "gfx9" AND source MATCHES "xdl")
+        if(NOT TEST_TARGETS MATCHES "gfx9" AND NOT TEST_TARGETS MATCHES "gfx11" AND NOT TEST_TARGETS MATCHES "gfx12" AND source MATCHES "xdl")
             message(DEBUG "removing xdl test ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
@@ -103,7 +110,7 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx10-3-generic)
         elseif(ARGN MATCHES "_wmma")
              list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(ARGN MATCHES "_smfmac")
@@ -174,7 +181,7 @@ function(add_gtest_executable TEST_NAME)
     endforeach()
 
     foreach(source IN LISTS ARGN)
-        if(NOT TEST_TARGETS MATCHES "gfx9" AND source MATCHES "xdl")
+        if(NOT TEST_TARGETS MATCHES "gfx9" AND NOT TEST_TARGETS MATCHES "gfx1[12]" AND source MATCHES "xdl")
             message(DEBUG "removing xdl test ${source} ")
             list(REMOVE_ITEM ARGN "${source}")
         endif()
@@ -197,7 +204,7 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx10-3-generic)
         elseif(ARGN MATCHES "_wmma")
              list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(ARGN MATCHES "_smfmac")
@@ -238,11 +245,13 @@ add_subdirectory(reference_conv_fwd)
 add_subdirectory(gemm)
 add_subdirectory(gemm_add)
 add_subdirectory(gemm_layernorm)
+add_subdirectory(gemm_multi_abd)
 add_subdirectory(gemm_split_k)
 add_subdirectory(gemm_universal)
 add_subdirectory(gemm_b_scale)
 add_subdirectory(gemm_universal_streamk)
 add_subdirectory(gemm_reduce)
+add_subdirectory(gemm_universal_reduce)
 add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
 add_subdirectory(batched_gemm_gemm)
@@ -271,6 +280,7 @@ add_subdirectory(conv_tensor_rearrange)
 add_subdirectory(transpose)
 add_subdirectory(permute_scale)
 add_subdirectory(wrapper)
+add_subdirectory(quantization)
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/batched_gemm/test_batched_gemm_wmma.cpp b/test/batched_gemm/test_batched_gemm_wmma.cpp
index 18f9db8c39..fc190bed85 100644
--- a/test/batched_gemm/test_batched_gemm_wmma.cpp
+++ b/test/batched_gemm/test_batched_gemm_wmma.cpp
@@ -12,7 +12,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +38,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Row,
-                                                                Row,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Row,
-                                                                                  Row,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Row,
+                                                                   Row,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Row,
+                                                                                     Row,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               K,
+                               N,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Row,
-                                                                Col,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Row,
-                                                                                  Col,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Row,
+                                                                   Col,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Row,
+                                                                                     Col,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               K,
+                               K,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Col,
-                                                                Row,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Col,
-                                                                                  Row,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Col,
+                                                                   Row,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Col,
+                                                                                     Row,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               M,
+                               N,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Col,
-                                                                Col,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Col,
-                                                                                  Col,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Col,
+                                                                   Col,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Col,
+                                                                                     Col,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               M,
+                               K,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -191,3 +249,20 @@ TEST_F(TestBatchedGemm, fp16)
 //     this->template Run<float>();
 // }
 // #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm/test_batched_gemm_xdl.cpp b/test/batched_gemm/test_batched_gemm_xdl.cpp
index f9bb626ce5..3b7c392004 100644
--- a/test/batched_gemm/test_batched_gemm_xdl.cpp
+++ b/test/batched_gemm/test_batched_gemm_xdl.cpp
@@ -13,6 +13,9 @@
 
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +40,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Row,
-                                                                Row,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Row,
-                                                                                  Row,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Row,
+                                                                   Row,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Row,
+                                                                                     Row,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               K,
+                               N,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Row,
-                                                                Col,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Row,
-                                                                                  Col,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Row,
+                                                                   Col,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Row,
+                                                                                     Col,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               K,
+                               K,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Col,
-                                                                Row,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Col,
-                                                                                  Row,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Col,
+                                                                   Row,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Col,
+                                                                                     Row,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               M,
+                               N,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                DataType,
-                                                                DataType,
-                                                                Col,
-                                                                Col,
-                                                                Row,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                DeviceBatchedGemm<Col,
-                                                                                  Col,
-                                                                                  Row,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  DataType,
-                                                                                  PassThrough,
-                                                                                  PassThrough,
-                                                                                  PassThrough>>(
-                            true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
+                                                                   DataType,
+                                                                   DataType,
+                                                                   Col,
+                                                                   Col,
+                                                                   Row,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   PassThrough,
+                                                                   DeviceBatchedGemm<Col,
+                                                                                     Col,
+                                                                                     Row,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     DataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>>(
+                               true,
+                               1,
+                               false,
+                               1,
+                               M,
+                               N,
+                               K,
+                               M,
+                               K,
+                               N,
+                               M * K,
+                               K * N,
+                               M * N,
+                               BatchCount,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -183,3 +243,20 @@ TEST_F(TestBatchedGemm, fp32)
     this->template Run<float>();
 }
 #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt
index 2b3288ef9d..70d7420992 100644
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -1,6 +1,14 @@
-add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16_xdl.cpp)
+add_gtest_executable(test_batched_gemm_gemm_fp16_xdl test_batched_gemm_gemm_fp16_xdl.cpp)
 if(result EQUAL 0)
-  add_custom_target(test_batched_gemm_gemm)
-  target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
-  add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+  target_link_libraries(test_batched_gemm_gemm_fp16_xdl PRIVATE utility device_batched_gemm_gemm_instance)
+endif()
+
+add_gtest_executable(test_batched_gemm_gemm_bf16_wmma test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_batched_gemm_gemm_bf16_wmma PRIVATE utility device_batched_gemm_gemm_instance)
+endif()
+
+add_gtest_executable(test_batched_gemm_gemm_fp16_wmma test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_batched_gemm_gemm_fp16_wmma PRIVATE utility device_batched_gemm_gemm_instance)
 endif()
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp
new file mode 100644
index 0000000000..41c375a624
--- /dev/null
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "test_batched_gemm_gemm_util.hpp"
+
+template <typename Tuple>
+class TestBatchedGemmGemmBF16 : public TestBatchedGemmGemm<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    std::tuple<BF16, BF16, BF16, BF16, Row, Col, Row, Row>,
+    std::tuple<BF16, BF16, BF16, BF16, Row, Col, Col, Row>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestBatchedGemmGemmBF16, KernelTypes);
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16)
+{
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_PadM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {136, 128, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_PadN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 136, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_PadK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 40, 128, 1},
+        {128, 128, 136, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_PadO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 136, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_OddM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {129, 128, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_OddN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 129, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_OddK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 33, 128, 1},
+        {128, 128, 129, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+// If kernel B1Layout is RowMajor, expect not to support odd O size
+TYPED_TEST(TestBatchedGemmGemmBF16, Test_BF16_OddO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 129, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmBF16, DISABLED_Bench_BF16)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {256, 256, 64, 64, 768},
+        {256, 256, 128, 128, 768},
+        {512, 512, 64, 64, 768},
+        {512, 512, 128, 128, 768},
+        {1024, 1024, 64, 64, 768},
+        {1024, 1024, 128, 128, 768},
+        {2048, 2048, 64, 64, 768},
+        {2048, 2048, 128, 128, 768},
+        {4096, 4096, 64, 64, 768},
+        {4096, 4096, 128, 128, 768},
+    };
+    this->bench_  = true;
+    this->verify_ = false;
+    this->Run();
+}
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp
new file mode 100644
index 0000000000..4dd55429b4
--- /dev/null
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "test_batched_gemm_gemm_util.hpp"
+
+template <typename Tuple>
+class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>,
+    std::tuple<F16, F16, F16, F16, Row, Col, Col, Row>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes);
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16)
+{
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {136, 128, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 136, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 40, 128, 1},
+        {128, 128, 136, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 136, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {129, 128, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 129, 32, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 33, 128, 1},
+        {128, 128, 129, 128, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+// If kernel B1Layout is RowMajor, expect not to support odd O size
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 129, 1},
+    };
+    this->bench_  = true;
+    this->verify_ = true;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmGemmFP16, DISABLED_Bench_FP16)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {256, 256, 64, 64, 768},
+        {256, 256, 128, 128, 768},
+        {512, 512, 64, 64, 768},
+        {512, 512, 128, 128, 768},
+        {1024, 1024, 64, 64, 768},
+        {1024, 1024, 128, 128, 768},
+        {2048, 2048, 64, 64, 768},
+        {2048, 2048, 128, 128, 768},
+        {4096, 4096, 64, 64, 768},
+        {4096, 4096, 128, 128, 768},
+    };
+    this->bench_  = true;
+    this->verify_ = false;
+    this->Run();
+}
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp
index 1a8d5c2e55..1fe7e12251 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp
@@ -1,8 +1,126 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_gemm_util.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+
+template <GemmSpecialization GemmSpec>
+struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using ALayout  = Row;
+    using B0Layout = Col;
+    using B1Layout = Row;
+    using CLayout  = Row;
+
+    using ADataType        = F16;
+    using B0DataType       = F16;
+    using B1DataType       = F16;
+    using AccDataType      = float;
+    using CShuffleDataType = float;
+    using CDataType        = F16;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = PassThrough;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+
+    // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value;
+
+    using DeviceGemmGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        ALayout,
+        B0Layout,
+        B1Layout,
+        CLayout,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        Acc0ElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        16,          // MPerXDL
+        16,          // NPerXDL
+        2,           // MXdlPerWave
+        8,           // NXdlPerWave
+        8,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<8, 32, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+    bool IsSupported(int M, int N, int K, int O)
+    {
+        auto gemm     = DeviceGemmGemmInstance{};
+        auto invoker  = gemm.MakeInvoker();
+        auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
+                                          static_cast<B0DataType*>(nullptr),
+                                          static_cast<B1DataType*>(nullptr),
+                                          static_cast<CDataType*>(nullptr),
+                                          M,
+                                          N,
+                                          K,
+                                          O,
+                                          0,              // BatchCount
+                                          0,              // StrideA
+                                          0,              // StrideB0
+                                          0,              // StrideB1
+                                          0,              // StrideC
+                                          0,              // BatchStrideA
+                                          0,              // BatchStrideB0
+                                          0,              // BatchStrideB1
+                                          0,              // BatchStrideC
+                                          PassThrough{},  // a_element_op
+                                          PassThrough{},  // b0_element_op
+                                          PassThrough{},  // acc0_element_op
+                                          PassThrough{},  // b1_element_op
+                                          PassThrough{}); // c_element_op
+
+        return gemm.IsSupportedArgument(argument);
+    }
+};
 
 template <typename Tuple>
 class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
index b0fffc466e..b538f9fb0a 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -1,11 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
 #include "profiler/profile_batched_gemm_gemm_impl.hpp"
 
 using ck::tensor_operation::device::GemmSpecialization;
@@ -13,7 +12,8 @@ using ck::tensor_operation::device::GemmSpecialization;
 template <ck::index_t N>
 using I = ck::Number<N>;
 
-using F16 = ck::half_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -70,120 +70,3 @@ struct TestBatchedGemmGemm : public ::testing::Test
         }
     }
 };
-
-template <GemmSpecialization GemmSpec>
-struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using ALayout  = Row;
-    using B0Layout = Col;
-    using B1Layout = Row;
-    using CLayout  = Row;
-
-    using ADataType        = F16;
-    using B0DataType       = F16;
-    using B1DataType       = F16;
-    using AccDataType      = float;
-    using CShuffleDataType = float;
-    using CDataType        = F16;
-
-    using AElementOp    = PassThrough;
-    using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = PassThrough;
-    using B1ElementOp   = PassThrough;
-    using CElementOp    = PassThrough;
-
-    template <ck::index_t... Is>
-    using S = ck::Sequence<Is...>;
-
-    // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value;
-
-    using DeviceGemmGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
-        ALayout,
-        B0Layout,
-        B1Layout,
-        CLayout,
-        ADataType,
-        B0DataType,
-        B1DataType,
-        CDataType,
-        AccDataType,
-        CShuffleDataType,
-        AElementOp,
-        B0ElementOp,
-        Acc0ElementOp,
-        B1ElementOp,
-        CElementOp,
-        GemmSpec,
-        1,
-        256,
-        128,         // MPerBlock
-        128,         // NPerBlock
-        32,          // KPerBlock
-        128,         // Gemm1NPerBlock
-        32,          // Gemm1KPerBlock
-        8,           // AK1
-        8,           // BK1
-        2,           // B1K1
-        32,          // MPerXDL
-        32,          // NPerXDL
-        1,           // MXdlPerWave
-        4,           // NXdlPerWave
-        4,           // Gemm1NXdlPerWave
-        S<4, 64, 1>, // ABlockTransfer
-        S<1, 0, 2>,
-        S<1, 0, 2>,
-        2,
-        8,
-        8,
-        true,
-        S<4, 64, 1>, // BBlockTransfer
-        S<1, 0, 2>,
-        S<1, 0, 2>,
-        2,
-        8,
-        8,
-        true,
-        S<8, 32, 1>, // B1BlockTransfer
-        S<0, 2, 1>,
-        S<0, 2, 1>,
-        1,
-        4,
-        2,
-        false,
-        1,              // CShuffleMXdlPerWavePerShuffle
-        2,              // CShuffleNXdlPerWavePerShuffle
-        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
-
-    bool IsSupported(int M, int N, int K, int O)
-    {
-        auto gemm     = DeviceGemmGemmInstance{};
-        auto invoker  = gemm.MakeInvoker();
-        auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
-                                          static_cast<B0DataType*>(nullptr),
-                                          static_cast<B1DataType*>(nullptr),
-                                          static_cast<CDataType*>(nullptr),
-                                          M,
-                                          N,
-                                          K,
-                                          O,
-                                          0,              // BatchCount
-                                          0,              // StrideA
-                                          0,              // StrideB0
-                                          0,              // StrideB1
-                                          0,              // StrideC
-                                          0,              // BatchStrideA
-                                          0,              // BatchStrideB0
-                                          0,              // BatchStrideB1
-                                          0,              // BatchStrideC
-                                          PassThrough{},  // a_element_op
-                                          PassThrough{},  // b0_element_op
-                                          PassThrough{},  // acc0_element_op
-                                          PassThrough{},  // b1_element_op
-                                          PassThrough{}); // c_element_op
-
-        return gemm.IsSupportedArgument(argument);
-    }
-};
diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
index 6c04086e0e..5e250bc356 100644
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
@@ -7,6 +7,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 
+static ck::index_t instance_index = -1;
+
 namespace {
 using F16 = ck::half_t;
 
@@ -56,7 +58,22 @@ class TestBatchedGemmMultiD : public ::testing::Test
                                                                             PassThrough,
                                                                             PassThrough,
                                                                             PassThrough>>(
-                true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+                true,  // do_verification
+                1,     // init_method
+                false, // do_log
+                1,     // time_kernel,
+                M,
+                N,
+                K,
+                std::is_same_v<ALayout, Row> ? K : M, // strideA
+                std::is_same_v<BLayout, Row> ? N : K, // strideB
+                std::is_same_v<CLayout, Row> ? N : M, // strideC
+                // BatchStrideA BatchStrideB, BatchStrideC
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
         EXPECT_TRUE(pass);
     }
 };
@@ -74,3 +91,18 @@ TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
 #ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
 #endif
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
index cb46a995c6..1ab29f251a 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
@@ -4,6 +4,9 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
 {
@@ -174,3 +177,20 @@ TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index d8ee744c60..8074d8a311 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
@@ -9,6 +9,9 @@
 #include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;
 
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
+
 template <ck::index_t N>
 using I = ck::Number<N>;
 
@@ -57,15 +60,38 @@ struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
                                                                          B1Layout,
                                                                          CLayout,
                                                                          MaskingType::value>(
-            verify_, 1, false, bench_, M, N, K, O, BatchCount);
+            verify_,
+            1,
+            false,
+            bench_,
+            M,
+            N,
+            K,
+            O,
+            BatchCount,
+            -1, //  StrideA
+            -1, // StrideB0
+            -1, // StrideB1
+            -1, // StrideC
+            -1, // BatchStrideA
+            -1, //  BatchStrideB0
+            -1, // BatchStrideB1
+            -1, // BatchStrideC
+            -1, // alpha
+            instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths  = this->lengths_[i];
             int M          = lengths[0];
             int N          = lengths[1];
             int K          = lengths[2];
@@ -133,11 +159,11 @@ struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
             8,           // AK1
             8,           // BK1
             2,           // B1K1
-            32,          // MPerXDL
-            32,          // NPerXDL
-            1,           // MXdlPerWave
-            4,           // NXdlPerWave
-            4,           // Gemm1NXdlPerWave
+            16,          // MPerXDL
+            16,          // NPerXDL
+            2,           // MXdlPerWave
+            8,           // NXdlPerWave
+            8,           // Gemm1NXdlPerWave
             S<4, 64, 1>, // ABlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
@@ -162,7 +188,7 @@ struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
             1,              // CShuffleMXdlPerWavePerShuffle
             2,              // CShuffleNXdlPerWavePerShuffle
             S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
             false>;
 
     bool IsSupported(int M, int N, int K, int O)
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
index ef88ce6d81..9ce603c575 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
index b38b10d195..40ce64837d 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
index 1464eacfa5..e37cadd0c5 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
@@ -10,7 +10,8 @@
 #include "profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp"
 
 #include <hip/hip_runtime.h>
-
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
 using ck::tensor_operation::device::TensorSpecialization;
@@ -66,21 +67,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
                                                                               Acc0BiasDataType,
                                                                               Acc1BiasDataType,
                                                                               MaskingType::value>(
-                verify_, 2, false, bench_, M, N, K, O, G0, G1);
+                verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
-            int M  = lengths[0];
-            int N  = lengths[1];
-            int K  = lengths[2];
-            int O  = lengths[3];
-            int G0 = lengths[4];
-            int G1 = lengths[5];
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
+            int M         = lengths[0];
+            int N         = lengths[1];
+            int K         = lengths[2];
+            int O         = lengths[3];
+            int G0        = lengths[4];
+            int G1        = lengths[5];
 
             this->RunSingle(M, N, K, O, G0, G1);
         }
@@ -293,11 +299,11 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128
             8,           // AK1
             8,           // BK1
             2,           // B1K1
-            32,          // MPerXDL
-            32,          // NPerXDL
-            1,           // MXdlPerWave
-            4,           // NXdlPerWave
-            4,           // Gemm1NXdlPerWave
+            16,          // MPerXDL
+            16,          // NPerXDL
+            2,           // MXdlPerWave
+            8,           // NXdlPerWave
+            8,           // Gemm1NXdlPerWave
             S<4, 64, 1>, // ABlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
@@ -322,7 +328,7 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128
             1,              // CShuffleMXdlPerWavePerShuffle
             2,              // CShuffleNXdlPerWavePerShuffle
             S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
             MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle
 
     bool IsSupported(int M, int N, int K, int O)
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
index 8d894576c4..b75b7e43cf 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
@@ -5,6 +5,8 @@
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
 #include "test_batched_gemm_device_utils.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -228,3 +230,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp
index 3a86736f44..61baa50cd7 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp
@@ -5,6 +5,9 @@
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
 #include "test_batched_gemm_device_utils.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -191,3 +194,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
index 9df03ffd2a..13d2e0f0a2 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
@@ -9,6 +9,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
 
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
 using ck::tensor_operation::device::TensorSpecialization;
@@ -64,21 +66,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
                                                                          ck::Tuple<>,
                                                                          ck::Tuple<>,
                                                                          MaskingType::value>(
-                verify_, 2, false, bench_, M, N, K, O, G0, G1);
+                verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
-            int M  = lengths[0];
-            int N  = lengths[1];
-            int K  = lengths[2];
-            int O  = lengths[3];
-            int G0 = lengths[4];
-            int G1 = lengths[5];
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
+            int M         = lengths[0];
+            int N         = lengths[1];
+            int K         = lengths[2];
+            int O         = lengths[3];
+            int G0        = lengths[4];
+            int G1        = lengths[5];
 
             this->RunSingle(M, N, K, O, G0, G1);
         }
@@ -144,11 +151,11 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128
             8,           // AK1
             8,           // BK1
             2,           // B1K1
-            32,          // MPerXDL
-            32,          // NPerXDL
-            1,           // MXdlPerWave
-            4,           // NXdlPerWave
-            4,           // Gemm1NXdlPerWave
+            16,          // MPerXDL
+            16,          // NPerXDL
+            2,           // MXdlPerWave
+            8,           // NXdlPerWave
+            8,           // Gemm1NXdlPerWave
             S<4, 64, 1>, // ABlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
@@ -173,7 +180,7 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128
             1,              // CShuffleMXdlPerWavePerShuffle
             2,              // CShuffleNXdlPerWavePerShuffle
             S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            4,              // CShuffleBlockTransferScalarPerVector_NPerBlock
             MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle
 
     bool IsSupported(int M, int N, int K, int O)
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index cc514261e6..66908360d1 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -15,6 +15,9 @@ using F32  = float;
 using BF16 = ck::bhalf_t;
 using F64  = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormBwdRank4 : public ::testing::Test
 {
@@ -37,33 +40,48 @@ class TestBatchNormBwdRank4 : public ::testing::Test
     template <int NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
-                                                                         DxDataType,
-                                                                         DyDataType,
-                                                                         AccDataType,
-                                                                         ScaleDataType,
-                                                                         BiasDataType,
-                                                                         MeanVarDataType,
-                                                                         4,
-                                                                         NumReduceDim>(
-                               true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
+            pass =
+                pass &&
+                ck::profiler::profile_batchnorm_backward_impl<XDataType,
+                                                              DxDataType,
+                                                              DyDataType,
+                                                              AccDataType,
+                                                              ScaleDataType,
+                                                              BiasDataType,
+                                                              MeanVarDataType,
+                                                              4,
+                                                              NumReduceDim>(
+                    true, 3, false, false, inOutLengths, reduceDims, true, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
-                                                                         DxDataType,
-                                                                         DyDataType,
-                                                                         AccDataType,
-                                                                         ScaleDataType,
-                                                                         BiasDataType,
-                                                                         MeanVarDataType,
-                                                                         4,
-                                                                         NumReduceDim>(
-                               true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
+            pass =
+                pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
+                                                                      DxDataType,
+                                                                      DyDataType,
+                                                                      AccDataType,
+                                                                      ScaleDataType,
+                                                                      BiasDataType,
+                                                                      MeanVarDataType,
+                                                                      4,
+                                                                      NumReduceDim>(true,
+                                                                                    3,
+                                                                                    false,
+                                                                                    false,
+                                                                                    inOutLengths,
+                                                                                    reduceDims,
+                                                                                    false,
+                                                                                    epsilon,
+                                                                                    instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -103,3 +121,19 @@ TYPED_TEST(TestBatchNormBwdRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index 6bf635f0cd..8d81a3892c 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -16,6 +16,9 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F64  = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormFwdRank4 : public ::testing::Test
 {
@@ -38,9 +41,14 @@ class TestBatchNormFwdRank4 : public ::testing::Test
     template <int NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
@@ -61,7 +69,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
                                                                                    true,
                                                                                    true,
                                                                                    epsilon,
-                                                                                   averageFactor);
+                                                                                   averageFactor,
+                                                                                   instance_index);
 
             pass =
                 pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
@@ -80,7 +89,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
                                                                                    false,
                                                                                    false,
                                                                                    epsilon,
-                                                                                   averageFactor);
+                                                                                   averageFactor,
+                                                                                   instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -120,3 +130,19 @@ TYPED_TEST(TestBatchNormFwdRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp
index 0165192acf..41c9cdb94e 100644
--- a/test/batchnorm/batchnorm_infer_rank_4.cpp
+++ b/test/batchnorm/batchnorm_infer_rank_4.cpp
@@ -10,6 +10,9 @@
 
 #include "profiler/profile_batchnorm_infer_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 using F16  = ck::half_t;
 using F32  = float;
 using BF16 = ck::bhalf_t;
@@ -36,31 +39,38 @@ class TestBatchNormInferRank4 : public ::testing::Test
     template <int NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
-                                                                      YDataType,
-                                                                      AccDataType,
-                                                                      ScaleDataType,
-                                                                      BiasDataType,
-                                                                      MeanVarDataType,
-                                                                      4,
-                                                                      NumReduceDim>(
-                               true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl<XDataType,
+                                                              YDataType,
+                                                              AccDataType,
+                                                              ScaleDataType,
+                                                              BiasDataType,
+                                                              MeanVarDataType,
+                                                              4,
+                                                              NumReduceDim>(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
-                                                                      YDataType,
-                                                                      AccDataType,
-                                                                      ScaleDataType,
-                                                                      BiasDataType,
-                                                                      MeanVarDataType,
-                                                                      4,
-                                                                      NumReduceDim>(
-                               true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl<XDataType,
+                                                              YDataType,
+                                                              AccDataType,
+                                                              ScaleDataType,
+                                                              BiasDataType,
+                                                              MeanVarDataType,
+                                                              4,
+                                                              NumReduceDim>(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -100,3 +110,20 @@ TYPED_TEST(TestBatchNormInferRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt
index 374e5b4990..b92888b1f1 100644
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -3,7 +3,11 @@ add_subdirectory(gemm)
 add_subdirectory(gemm_weight_preshuffle)
 add_subdirectory(batched_gemm)
 add_subdirectory(grouped_gemm)
+add_subdirectory(grouped_gemm_preshuffle)
+add_subdirectory(grouped_gemm_multi_d)
 add_subdirectory(gemm_multi_d)
+add_subdirectory(gemm_multi_abd)
+add_subdirectory(gemm_streamk)
 add_subdirectory(data_type)
 add_subdirectory(container)
 add_subdirectory(elementwise)
@@ -23,3 +27,6 @@ add_subdirectory(add_rmsnorm2d_rdquant)
 add_subdirectory(gemm_block_scale)
 add_subdirectory(utility)
 add_subdirectory(reduce)
+add_subdirectory(epilogue)
+add_subdirectory(atomic_add_op)
+add_subdirectory(fmha)
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/CMakeLists.txt b/test/ck_tile/add_rmsnorm2d_rdquant/CMakeLists.txt
index 37774f7643..64672e200b 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/CMakeLists.txt
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/CMakeLists.txt
@@ -18,7 +18,7 @@ function(create_tile_add_rmsnorm2d_rdquant_fwd SUFFIX)
     set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
 endfunction()
 
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     create_tile_add_rmsnorm2d_rdquant_fwd("fp16")
     create_tile_add_rmsnorm2d_rdquant_fwd("bf16")
 else()
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/test/ck_tile/add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
index faa134e5c4..b7bd7ac7df 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -80,55 +80,17 @@ struct add_rmsnorm2d_rdquant_fwd_traits_
     using InputDataType     = ck_tile::remove_cvref_t<InputDataType_>;
     using QuantizedDataType = ck_tile::remove_cvref_t<QuantizedDataType_>;
 
-    static constexpr auto WarpSize        = ck_tile::get_warp_size();
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
 
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN      = kPadN_;
     static constexpr bool kSaveX     = kSaveX_;
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
index dd90034064..d997596414 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
@@ -58,7 +58,7 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a)
     using Kernel = ck_tile::AddRmsnorm2dRdquantFwd<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
diff --git a/test/ck_tile/atomic_add_op/CMakeLists.txt b/test/ck_tile/atomic_add_op/CMakeLists.txt
new file mode 100644
index 0000000000..5dfb4d9db3
--- /dev/null
+++ b/test/ck_tile/atomic_add_op/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_gtest_executable(test_atomic test_atomic.cpp)
+set(CTEST_OUTPUT_ON_FAILURE ON)
diff --git a/test/ck_tile/atomic_add_op/test_atomic.cpp b/test/ck_tile/atomic_add_op/test_atomic.cpp
new file mode 100644
index 0000000000..905f73afee
--- /dev/null
+++ b/test/ck_tile/atomic_add_op/test_atomic.cpp
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights
+// reserved.
+
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <hip/hip_runtime.h>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "test_atomic.hpp"
+
+struct AtomicKernelParam
+{
+    AtomicKernelParam(ck_tile::index_t m_, ck_tile::index_t n_) : m(m_), n(n_) {}
+    ck_tile::index_t m;
+    ck_tile::index_t n;
+};
+
+template <typename DataType_, ck_tile::index_t multiple_>
+class TestAtomicKernel : public ::testing::TestWithParam<std::tuple<int, int>>
+{
+    struct AtomicKernelWaveSize
+    {
+        using BlockWaves = ck_tile::sequence<2, 1>;
+        using BlockTile  = ck_tile::sequence<128, 8>;
+        using WaveTile   = ck_tile::sequence<64, 8>;
+    };
+
+    template <typename Config>
+    void RunTestImpl_(const AtomicKernelParam& params)
+    {
+        using XDataType = DataType_;
+
+        const ck_tile::index_t m = params.m;
+        const ck_tile::index_t n = params.n;
+
+        std::cout << "Input Tensor Dimensions: " << m << ", " << n << std::endl;
+
+        constexpr int dword_bytes = 4;
+        const int base_vec        = dword_bytes / static_cast<int>(sizeof(XDataType));
+        const int vec             = multiple_ * base_vec;
+
+        ASSERT_EQ(n % vec, 0) << " Row dimension must be divisible by vector width: n=" << n
+                              << " vec=" << vec << " (multiple=" << multiple_
+                              << ", base_vec=" << base_vec << ")";
+
+        // host tensors
+        ck_tile::HostTensor<XDataType> x_host_ref({m, n});
+        ck_tile::HostTensor<XDataType> x_host_dev({m, n});
+
+        // device buffers
+        ck_tile::DeviceMem x_dev_input(x_host_dev.get_element_space_size_in_bytes());
+        x_dev_input.SetZero();
+        x_host_ref.SetZero();
+
+        using BlockWaves = typename Config::BlockWaves;
+        using BlockTile  = typename Config::BlockTile;
+        using WaveTile   = typename Config::WaveTile;
+        using Vector     = ck_tile::sequence<1, vec>;
+
+        // Compile-time sanity: BlockTile == WaveTile * BlockWaves
+        static_assert(BlockTile::at(ck_tile::number<0>{}) ==
+                          WaveTile::at(ck_tile::number<0>{}) * BlockWaves::at(ck_tile::number<0>{}),
+                      "BlockTile.M must equal WaveTile.M * BlockWaves.M");
+        static_assert(BlockTile::at(ck_tile::number<1>{}) ==
+                          WaveTile::at(ck_tile::number<1>{}) * BlockWaves::at(ck_tile::number<1>{}),
+                      "BlockTile.N must equal WaveTile.N * BlockWaves.N");
+
+        std::cout << "Vector per thread = " << vec
+                  << "  BlockWaves=" << BlockWaves::at(ck_tile::number<0>{}) << "x"
+                  << BlockWaves::at(ck_tile::number<1>{})
+                  << "  WaveTile=" << WaveTile::at(ck_tile::number<0>{}) << "x"
+                  << WaveTile::at(ck_tile::number<1>{})
+                  << "  BlockTile=" << BlockTile::at(ck_tile::number<0>{}) << "x"
+                  << BlockTile::at(ck_tile::number<1>{}) << std::endl;
+
+        const ck_tile::index_t kGridSize =
+            ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));
+
+        using Shape   = ck_tile::AtomicKernelShape<BlockWaves, BlockTile, WaveTile, Vector>;
+        using Problem = ck_tile::AtomicKernelProblem<XDataType, Shape>;
+        using Kernel  = ck_tile::AtomicKernel<Problem>;
+
+        const ck_tile::index_t kBlockSize      = Kernel::BlockSize();
+        constexpr ck_tile::index_t kBlockPerCu = 1;
+
+        (void)hipGetLastError(); // clear sticky
+
+        launch_kernel(ck_tile::stream_config{nullptr, false, 0, 0, 1},
+                      ck_tile::make_kernel<kBlockPerCu>(
+                          Kernel{},
+                          kGridSize,
+                          kBlockSize,
+                          0,
+                          static_cast<XDataType*>(x_dev_input.GetDeviceBuffer()),
+                          m,
+                          n));
+
+        ASSERT_EQ(hipPeekAtLastError(), hipSuccess)
+            << "hipPeekAtLastError: " << hipGetErrorString(hipGetLastError());
+        ASSERT_EQ(hipDeviceSynchronize(), hipSuccess) << "hipDeviceSynchronize failed";
+
+        // host reference computation
+        x_dev_input.FromDevice(x_host_dev.mData.data());
+        for(int i = 0; i < m; ++i)
+            for(int j = 0; j < n; ++j)
+                x_host_ref(i, j) = static_cast<XDataType>(1);
+
+        const bool pass = ck_tile::check_err(x_host_dev, x_host_ref);
+        EXPECT_TRUE(pass);
+    }
+
+    protected:
+    void RunTest(const AtomicKernelParam& params) { RunTestImpl_<AtomicKernelWaveSize>(params); }
+};
+
+class TestAtomicKernelHalf_1 : public TestAtomicKernel<ck_tile::half_t, 1>
+{
+};
+class TestAtomicKernelHalf_2 : public TestAtomicKernel<ck_tile::half_t, 2>
+{
+};
+class TestAtomicKernelHalf_4 : public TestAtomicKernel<ck_tile::half_t, 4>
+{
+};
+class TestAtomicKernelBF16_1 : public TestAtomicKernel<ck_tile::bf16_t, 1>
+{
+};
+class TestAtomicKernelBF16_2 : public TestAtomicKernel<ck_tile::bf16_t, 2>
+{
+};
+class TestAtomicKernelBF16_4 : public TestAtomicKernel<ck_tile::bf16_t, 4>
+{
+};
+class TestAtomicKernelBF8_1 : public TestAtomicKernel<ck_tile::bf8_t, 1>
+{
+};
+class TestAtomicKernelBF8_2 : public TestAtomicKernel<ck_tile::bf8_t, 2>
+{
+};
+class TestAtomicKernelFP8_1 : public TestAtomicKernel<ck_tile::fp8_t, 1>
+{
+};
+class TestAtomicKernelFP8_2 : public TestAtomicKernel<ck_tile::fp8_t, 2>
+{
+};
+class TestAtomicKernelFloat_1 : public TestAtomicKernel<float, 1>
+{
+};
+class TestAtomicKernelFloat_2 : public TestAtomicKernel<float, 2>
+{
+};
+class TestAtomicKernelFloat_4 : public TestAtomicKernel<float, 4>
+{
+};
+
+TEST_P(TestAtomicKernelHalf_1, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelHalf_2, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelHalf_4, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelBF16_1, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelBF16_2, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelBF16_4, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelBF8_1, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelBF8_2, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelFP8_1, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelFP8_2, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelFloat_1, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelFloat_2, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+TEST_P(TestAtomicKernelFloat_4, TestCorrectness)
+{
+    auto [M, N] = GetParam();
+    this->RunTest({M, N});
+}
+
+// Common parameter lists
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelHalf_1,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelHalf_2,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelHalf_4,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelBF16_1,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelBF16_2,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelBF16_4,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelBF8_1,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelBF8_2,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelFP8_1,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelFP8_2,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelFloat_1,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelFloat_2,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
+
+INSTANTIATE_TEST_SUITE_P(TestAtomicKernelSuite,
+                         TestAtomicKernelFloat_4,
+                         ::testing::Values(std::tuple{64, 8},
+                                           std::tuple{64, 16},
+                                           std::tuple{64, 32}));
diff --git a/test/ck_tile/atomic_add_op/test_atomic.hpp b/test/ck_tile/atomic_add_op/test_atomic.hpp
new file mode 100644
index 0000000000..27edae3a46
--- /dev/null
+++ b/test/ck_tile/atomic_add_op/test_atomic.hpp
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+
+namespace ck_tile {
+
+template <typename BlockWaves, typename BlockTile, typename WaveTile, typename Vector>
+struct AtomicKernelShape
+{
+    static constexpr index_t MWarps = BlockWaves::at(number<0>{});
+    static constexpr index_t NWarps = BlockWaves::at(number<1>{});
+
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+
+    static constexpr index_t Warp_M = WaveTile::at(number<0>{});
+    static constexpr index_t Warp_N = WaveTile::at(number<1>{});
+
+    static constexpr index_t Vector_M = Vector::at(number<0>{});
+    static constexpr index_t Vector_N = Vector::at(number<1>{});
+
+    static constexpr index_t WarpPerBlock_M = MWarps;
+    static constexpr index_t WarpPerBlock_N = NWarps;
+
+    static constexpr index_t RepeatInWarp =
+        Warp_M * Warp_N / Vector_M / Vector_N / ck_tile::get_warp_size();
+    static constexpr index_t RepeatInWarp_M =
+        (Warp_M / Vector_M > Warp_N / Vector_N) ? RepeatInWarp : 1;
+    static constexpr index_t RepeatInWarp_N =
+        (Warp_M / Vector_M > Warp_N / Vector_N) ? 1 : RepeatInWarp;
+
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M / RepeatInWarp_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N / RepeatInWarp_N;
+
+    static constexpr index_t Repeat_M = Block_M * RepeatInWarp_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N * RepeatInWarp_N / (WarpPerBlock_N * Warp_N);
+
+    static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
+
+    static constexpr index_t BlockSize = get_warp_size() * WaveNum;
+};
+
+template <typename XDataType_, typename BlockShape_>
+struct AtomicKernelProblem
+{
+    using XDataType  = remove_cvref_t<XDataType_>;
+    using BlockShape = remove_cvref_t<BlockShape_>;
+};
+
+template <typename Problem_>
+struct AtomicKernel
+{
+    using Problem   = remove_cvref_t<Problem_>;
+    using XDataType = typename Problem::XDataType;
+
+    static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
+    CK_TILE_HOST static constexpr auto BlockSize()
+    {
+        return ck_tile::is_wave32() ? kBlockSize / 2 : kBlockSize;
+    }
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        constexpr index_t warp_size = get_warp_size();
+
+        constexpr index_t X0 = S::ThreadPerWarp_N;
+        constexpr index_t X1 = S::Vector_N;
+
+        constexpr index_t Y0 = S::WaveNum;
+        constexpr index_t Y2 = warp_size / X0;
+        constexpr index_t Y1 = S::Warp_M / Y2;
+
+        constexpr auto encoding =
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
+                                       tuple<sequence<0, 1>, sequence<1, 2>>,
+                                       tuple<sequence<0, 0>, sequence<2, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<1, 1>>{};
+
+        return make_static_tile_distribution(encoding);
+    }
+
+    CK_TILE_DEVICE void operator()(XDataType* input, index_t M, index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+
+        constexpr auto block_dims = make_tuple(number<S::Block_M>{}, number<S::Block_N>{});
+
+        const index_t iM = __builtin_amdgcn_readfirstlane(get_block_id() * S::Block_M);
+
+        const auto input_view =
+            make_naive_tensor_view<address_space_enum::global, memory_operation_enum::atomic_add>(
+                input, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        auto input_window = make_tile_window(input_view, block_dims, {iM, 0});
+
+        const index_t num_iterations =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+        using tmp_tile =
+            decltype(make_static_distributed_tensor<XDataType>(MakeTileDistribution<Problem>()));
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_iterations; iN++)
+        {
+            tmp_tile add_value_tile;
+            tile_elementwise_inout([](auto& c) { c = static_cast<XDataType>(1.0f); },
+                                   add_value_tile);
+
+            update_tile(input_window, add_value_tile);
+            __syncthreads();
+
+            move_tile_window(input_window, {0, S::Block_N});
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/test/ck_tile/batched_gemm/CMakeLists.txt b/test/ck_tile/batched_gemm/CMakeLists.txt
index 532ead1124..9bcbc7352e 100644
--- a/test/ck_tile/batched_gemm/CMakeLists.txt
+++ b/test/ck_tile/batched_gemm/CMakeLists.txt
@@ -1,4 +1,3 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp)
 endif()
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm.cpp b/test/ck_tile/batched_gemm/test_batched_gemm.cpp
index 3e3b821498..9bcf082e18 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm.cpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
@@ -16,11 +16,11 @@ using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
 
 // clang-format off
 using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
-    // std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
-    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>//,
-    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    //             ALayout,     BLayout,     CLayout,   ADataType,   BDataType,   AccDataType,    CDataType
+    std::tuple<    Row,         Row,         Row,       F16,         F16,         F32,            F16>,
+    std::tuple<    Col,         Row,         Row,       F16,         F16,         F32,            F16>,
+    std::tuple<    Row,         Col,         Row,       F16,         F16,         F32,            F16>,
+    std::tuple<    Col,         Col,         Row,       F16,         F16,         F32,            F16>
     >;
 // clang-format on
 
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
index 74338ba383..8f24c9bfe1 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
@@ -1,9 +1,78 @@
 #pragma once
 
+struct GemmParams
+{
+    int M;
+    int N;
+    int K;
+    int batchCount;
+};
+
+struct StrideConfig
+{
+    int strideA;
+    int strideB;
+    int strideC;
+    int batchStrideA;
+    int batchStrideB;
+    int batchStrideC;
+};
+
 TYPED_TEST(TestCkTileBatchedGemm, Basic)
 {
-    constexpr int M = 256;
-    constexpr int N = 256;
-    constexpr int K = 512;
-    this->Run(M, N, K);
+    std::vector<GemmParams> gemmParams{{256, 256, 256, 1},
+                                       {256, 256, 256, 2},
+                                       {256, 256, 512, 2},
+                                       {256, 256, 64, 2},
+                                       {256, 256, 64, 3},
+                                       {256, 256, 64, 4},
+                                       {256, 256, 64, 8},
+                                       {256, 256, 64, 16}};
+
+    if(ck_tile::get_device_name() != "gfx950")
+    {
+        gemmParams.emplace_back(256, 256, 128, 2);
+    }
+
+    for(auto& params : gemmParams)
+    {
+        std::vector<StrideConfig> strideConfigs{{params.K,
+                                                 params.N,
+                                                 params.N,
+                                                 params.M * params.K,
+                                                 params.K * params.N,
+                                                 params.M * params.N},
+                                                {params.K,
+                                                 params.K,
+                                                 params.N,
+                                                 params.M * params.K,
+                                                 params.K * params.N,
+                                                 params.M * params.N},
+                                                {params.M,
+                                                 params.N,
+                                                 params.N,
+                                                 params.M * params.K,
+                                                 params.K * params.N,
+                                                 params.M * params.N},
+                                                {params.M,
+                                                 params.K,
+                                                 params.N,
+                                                 params.M * params.K,
+                                                 params.K * params.N,
+                                                 params.M * params.N}};
+
+        for(auto& conf : strideConfigs)
+        {
+            this->Run(params.M,
+                      params.N,
+                      params.K,
+                      conf.strideA,
+                      conf.strideB,
+                      conf.strideC,
+                      conf.batchStrideA,
+                      conf.batchStrideB,
+                      conf.batchStrideC,
+                      params.batchCount);
+        }
+    }
 }
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
index c4cb26bbee..f224f64136 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -27,21 +27,41 @@ class TestCkTileBatchedGemm : public ::testing::Test
     using DsLayout    = ck_tile::tuple<>;
     using DsDataType  = ck_tile::tuple<>;
 
-    template <typename ALayout, typename BLayout, typename CLayout>
+    struct GemmWarpConfig_Mfma
+    {
+        static constexpr ck_tile::index_t M_Tile      = 256;
+        static constexpr ck_tile::index_t N_Tile      = 256;
+        static constexpr ck_tile::index_t K_Tile      = 64;
+        static constexpr ck_tile::index_t M_Warp_Tile = 32;
+        static constexpr ck_tile::index_t N_Warp_Tile = 32;
+        static constexpr ck_tile::index_t K_Warp_Tile = 16;
+    };
+
+    struct GemmWarpConfig_Wmma
+    {
+        static constexpr ck_tile::index_t M_Tile      = 128;
+        static constexpr ck_tile::index_t N_Tile      = 128;
+        static constexpr ck_tile::index_t K_Tile      = 64;
+        static constexpr ck_tile::index_t M_Warp_Tile = 16;
+        static constexpr ck_tile::index_t N_Warp_Tile = 16;
+        static constexpr ck_tile::index_t K_Warp_Tile = 16;
+    };
+
+    template <typename GemmWarpConfig, typename ALayout, typename BLayout, typename CLayout>
     void invoke_batched_gemm(const ck_tile::BatchedGemmHostArgs& args,
                              const ck_tile::stream_config& s)
     {
-        constexpr ck_tile::index_t M_Tile = 256;
-        constexpr ck_tile::index_t N_Tile = 256;
-        constexpr ck_tile::index_t K_Tile = 64;
+        constexpr ck_tile::index_t M_Tile = GemmWarpConfig::M_Tile;
+        constexpr ck_tile::index_t N_Tile = GemmWarpConfig::N_Tile;
+        constexpr ck_tile::index_t K_Tile = GemmWarpConfig::K_Tile;
 
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
         constexpr ck_tile::index_t K_Warp = 1;
 
-        constexpr ck_tile::index_t M_Warp_Tile = 32;
-        constexpr ck_tile::index_t N_Warp_Tile = 32;
-        constexpr ck_tile::index_t K_Warp_Tile = 16;
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
 
         constexpr bool DoubleSmemBuffer = false;
 
@@ -260,9 +280,13 @@ class TestCkTileBatchedGemm : public ::testing::Test
                                           BatchStrideB,
                                           BatchStrideC,
                                           BatchCount};
-
-        invoke_batched_gemm<ALayout, BLayout, CLayout>(args,
-                                                       ck_tile::stream_config{nullptr, false});
+#if CK_TILE_USE_WMMA
+        invoke_batched_gemm<GemmWarpConfig_Wmma, ALayout, BLayout, CLayout>(
+            args, ck_tile::stream_config{nullptr, false});
+#else
+        invoke_batched_gemm<GemmWarpConfig_Mfma, ALayout, BLayout, CLayout>(
+            args, ck_tile::stream_config{nullptr, false});
+#endif
 
         std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
                   << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideC =" << StrideC
diff --git a/test/ck_tile/batched_transpose/CMakeLists.txt b/test/ck_tile/batched_transpose/CMakeLists.txt
index 111b7c2bed..fb45caf044 100644
--- a/test/ck_tile/batched_transpose/CMakeLists.txt
+++ b/test/ck_tile/batched_transpose/CMakeLists.txt
@@ -1,5 +1,4 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx950")
     add_gtest_executable(test_ck_tile_batched_transpose test_batched_transpose.cpp)
     set_property(TARGET test_ck_tile_batched_transpose PROPERTY CXX_STANDARD 20)
 else()
diff --git a/test/ck_tile/container/CMakeLists.txt b/test/ck_tile/container/CMakeLists.txt
index 50670c83e4..f13f0dbedf 100644
--- a/test/ck_tile/container/CMakeLists.txt
+++ b/test/ck_tile/container/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_tuple_apply test_tuple_apply.cpp)
     if(result EQUAL 0)
         target_link_libraries(test_ck_tile_tuple_apply PRIVATE utility)
diff --git a/test/ck_tile/data_type/CMakeLists.txt b/test/ck_tile/data_type/CMakeLists.txt
index 384fd3c1c4..a5713ac55c 100644
--- a/test/ck_tile/data_type/CMakeLists.txt
+++ b/test/ck_tile/data_type/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_pk_int4 test_pk_int4.cpp)
 endif()
 if(GPU_TARGETS MATCHES "gfx95")
diff --git a/test/ck_tile/data_type/test_fp8.cpp b/test/ck_tile/data_type/test_fp8.cpp
index 49fd68591f..cb3e8ae53e 100644
--- a/test/ck_tile/data_type/test_fp8.cpp
+++ b/test/ck_tile/data_type/test_fp8.cpp
@@ -94,7 +94,9 @@ TYPED_TEST(ConvertTest, ToFp8)
     EXPECT_EQ(c(+ck_tile::numeric<SrcT>::denorm_min()), 0b0'0000'000);
     EXPECT_EQ(c(-ck_tile::numeric<SrcT>::denorm_min()), 0b1'0000'000);
 
-    // All values smaller than min f8 subnormal must be converted to f8 zero
+    // All values <= min f8 subnormal/2 must be converted to f8 zero
+    EXPECT_EQ(c(+0.001953125f * 0.6f), 0b0'0000'001);
+    EXPECT_EQ(c(-0.001953125f * 0.6f), 0b1'0000'001);
     constexpr int src_min_subnorm_exp =
         -(ck_tile::numeric_traits<SrcT>::bias + ck_tile::numeric_traits<SrcT>::mant - 1);
     constexpr int dst_min_subnorm_exp =
@@ -176,7 +178,9 @@ TYPED_TEST(ConvertTest, ToFp8)
     EXPECT_EQ(c(+ck_tile::numeric<SrcT>::denorm_min()), 0b0'0000'000);
     EXPECT_EQ(c(-ck_tile::numeric<SrcT>::denorm_min()), 0b0'0000'000);
 
-    // All values smaller than min f8 subnormal must be converted to f8 zero
+    // All values <= min f8 subnormal/2 must be converted to f8 zero
+    EXPECT_EQ(c(+0.0009765625f * 0.6f), 0b0'0000'001);
+    EXPECT_EQ(c(-0.0009765625f * 0.6f), 0b1'0000'001);
     constexpr int src_min_subnorm_exp =
         -(ck_tile::numeric_traits<SrcT>::bias + ck_tile::numeric_traits<SrcT>::mant - 1);
     constexpr int dst_min_subnorm_exp =
@@ -282,7 +286,9 @@ TYPED_TEST(ConvertTest, ToBf8)
     EXPECT_EQ(c(+ck_tile::numeric<SrcT>::denorm_min()), 0b0'00000'00);
     EXPECT_EQ(c(-ck_tile::numeric<SrcT>::denorm_min()), 0b1'00000'00);
 
-    // All values smaller than min f8 subnormal must be converted to f8 zero
+    // All values <= min f8 subnormal/2 must be converted to f8 zero
+    EXPECT_EQ(c(+1.52587890625e-05f * 0.6f), 0b0'0000'001);
+    EXPECT_EQ(c(-1.52587890625e-05f * 0.6f), 0b1'0000'001);
     constexpr int src_min_subnorm_exp =
         -(ck_tile::numeric_traits<SrcT>::bias + ck_tile::numeric_traits<SrcT>::mant - 1);
     constexpr int dst_min_subnorm_exp =
@@ -373,7 +379,9 @@ TYPED_TEST(ConvertTest, ToBf8)
     EXPECT_EQ(c(+ck_tile::numeric<SrcT>::denorm_min()), 0b0'00000'00);
     EXPECT_EQ(c(-ck_tile::numeric<SrcT>::denorm_min()), 0b0'00000'00);
 
-    // All values smaller than min f8 subnormal must be converted to f8 zero
+    // All values <= min f8 subnormal/2 must be converted to f8 zero
+    EXPECT_EQ(c(+7.62939453125e-06f * 0.6f), 0b0'0000'001);
+    EXPECT_EQ(c(-7.62939453125e-06f * 0.6f), 0b1'0000'001);
     constexpr int src_min_subnorm_exp =
         -(ck_tile::numeric_traits<SrcT>::bias + ck_tile::numeric_traits<SrcT>::mant - 1);
     constexpr int dst_min_subnorm_exp =
diff --git a/test/ck_tile/data_type/test_pk_fp4.cpp b/test/ck_tile/data_type/test_pk_fp4.cpp
index 15f027e95d..b1e981624a 100644
--- a/test/ck_tile/data_type/test_pk_fp4.cpp
+++ b/test/ck_tile/data_type/test_pk_fp4.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
+#include <vector>
 #include <hip/hip_runtime.h>
 
 #include "ck_tile/core.hpp"
@@ -29,6 +30,12 @@ TEST(PackedFp4, NumericLimits)
     EXPECT_EQ(ck_tile::numeric<pk_fp4_t>::epsilon(), pk_fp4_t{0b00010001});
     EXPECT_EQ(ck_tile::numeric<pk_fp4_t>::round_error(), pk_fp4_t{0b00010001});
 }
+TEST(PackedFp4, fill)
+{
+    std::vector<pk_fp4_t> v_fp4(4);
+    ck_tile::FillUniformDistribution<pk_fp4_t>{1.f, 1.f}(v_fp4);
+    EXPECT_EQ(v_fp4[0].get(), pk_fp4_t{0b00100010}.get());
+}
 TEST(PackedFp4, ConvertBasic)
 {
     EXPECT_EQ(ck_tile::convert_to_type<pk_fp4_t>(0.0f), pk_fp4_t{0b00000000}.get());
@@ -102,7 +109,7 @@ struct SrcPkfp4Dst
                 // ex: fp32_t -> fp4 -> bf16_t
                 dst[i] = toDST(toPF4(src[i]));
                 // ex: fp32x2_t -> pk_fp4 -> unpack<0> -> bf16_t
-                dst[i + 1] = toDST(toPF4(toPF4(input2).unpack(number<1>{})));
+                dst[i + 1] = toDST(toPF4(input2).unpack(number<1>{}));
             }
             else
             {
diff --git a/test/ck_tile/elementwise/CMakeLists.txt b/test/ck_tile/elementwise/CMakeLists.txt
index d22a30ff56..860a23a62a 100644
--- a/test/ck_tile/elementwise/CMakeLists.txt
+++ b/test/ck_tile/elementwise/CMakeLists.txt
@@ -1,6 +1,3 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_elementwise_1d test_elementwise_1d.cpp)
-    if(result EQUAL 0)
-        target_link_libraries(test_ck_tile_elementwise_1d PRIVATE utility)
-    endif()
-endif()
\ No newline at end of file
+endif()
diff --git a/test/ck_tile/elementwise/test_elementwise_1d.cpp b/test/ck_tile/elementwise/test_elementwise_1d.cpp
index 3ce6e78d1d..7daba611e9 100644
--- a/test/ck_tile/elementwise/test_elementwise_1d.cpp
+++ b/test/ck_tile/elementwise/test_elementwise_1d.cpp
@@ -92,43 +92,52 @@ class TestCkTileElementwise : public ::testing::Test
 
         YDataType* p_y_device = static_cast<YDataType*>(d_y_mem.GetDeviceBuffer());
 
+        auto run_elementwise_kernel = [&](auto has_remainder) {
+            constexpr bool kPad = decltype(has_remainder)::value;
+            using Problem       = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                                      ComputeDataType,
+                                                                      YDataType,
+                                                                      TestElementWiseShape,
+                                                                      ElementwiseOpType,
+                                                                      kPad>;
+            using Policy        = ck_tile::ElementWiseDefaultPolicy;
+            ck_tile::ElementWiseKernel<Problem, Policy> ew_kernel;
+
+            ck_tile::index_t grid_size = (total_m_elements + TestElementWiseShape::kBlockM - 1) /
+                                         TestElementWiseShape::kBlockM;
+            dim3 grid(grid_size, 1, 1);
+            dim3 block                             = dim3(ew_kernel.BlockSize());
+            constexpr ck_tile::index_t kBlockPerCu = 1;
+
+            ck_tile::stream_config s{nullptr, false, 0}; // Default stream, no timing, no log
+
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<kBlockPerCu> // MinBlockPerCu
+                                   (ew_kernel,
+                                    grid,
+                                    block,
+                                    0, // actual shared memory
+                                    lens,
+                                    strides, // input strides
+                                    strides, // output strides
+                                    d_x_ptrs_tuple,
+                                    p_y_device));
+        };
+
         // Problem and Policy
-        using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
-                                                            ComputeDataType,
-                                                            YDataType,
-                                                            TestElementWiseShape,
-                                                            ElementwiseOpType>;
-        using Policy  = ck_tile::ElementWiseDefaultPolicy;
-
-        ck_tile::ElementWiseKernel<Problem, Policy> ew_kernel;
-
-        // Launch configuration
-        ck_tile::index_t grid_size =
-            (total_m_elements + TestElementWiseShape::kBlockM - 1) / TestElementWiseShape::kBlockM;
-        dim3 grid(grid_size, 1, 1);
-        dim3 block(TestElementWiseShape::kBlockSize, 1, 1);
-        constexpr ck_tile::index_t kBlockPerCu = 1;
-
-        ck_tile::stream_config s{nullptr, false, 0}; // Default stream, no timing, no log
-
-        // Check if the kernel configuration is supported
-        if(!ew_kernel.IsSupportedArgument(lens))
+        using BaseProblem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                TestElementWiseShape,
+                                                                ElementwiseOpType>;
+        if(total_m_elements % BaseProblem::BlockShape::kVectorM)
         {
-            throw std::runtime_error(
-                "The kernel configuration is not supported for the given input size.");
+            run_elementwise_kernel(std::true_type{});
+        }
+        else
+        {
+            run_elementwise_kernel(std::false_type{});
         }
-
-        ck_tile::launch_kernel(s,
-                               ck_tile::make_kernel<kBlockPerCu> // MinBlockPerCu
-                               (ew_kernel,
-                                grid,
-                                block,
-                                0, // actual shared memory
-                                lens,
-                                strides, // input strides
-                                strides, // output strides
-                                d_x_ptrs_tuple,
-                                p_y_device));
 
         d_y_mem.FromDevice(h_y.data());
 
diff --git a/test/ck_tile/epilogue/CMakeLists.txt b/test/ck_tile/epilogue/CMakeLists.txt
new file mode 100644
index 0000000000..4103b9d4db
--- /dev/null
+++ b/test/ck_tile/epilogue/CMakeLists.txt
@@ -0,0 +1 @@
+add_gtest_executable(test_ck_tile_cshuffle_epilogue test_cshuffle_epilogue.cpp)
diff --git a/test/ck_tile/epilogue/test_cshuffle_epilogue.cpp b/test/ck_tile/epilogue/test_cshuffle_epilogue.cpp
new file mode 100644
index 0000000000..d2ae4c6adc
--- /dev/null
+++ b/test/ck_tile/epilogue/test_cshuffle_epilogue.cpp
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_cshuffle_epilogue_util.hpp"
+#include <gtest/gtest.h>
+#include <hip/hip_runtime.h>
+
+using namespace ck_tile;
+
+class CShuffleEpilogueTest : public ::testing::Test
+{
+    protected:
+    void SetUp() override {}
+};
+
+TEST_F(CShuffleEpilogueTest, BasicHalfTest)
+{
+    // Basic test configuration with half_t data types
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using ODataType   = ck_tile::half_t;
+
+    constexpr index_t kMPerBlock = 256;
+    constexpr index_t kNPerBlock = 256;
+    constexpr index_t MWave      = 2;
+    constexpr index_t NWave      = 2;
+    constexpr index_t MPerXdl    = 32;
+    constexpr index_t NPerXdl    = 32;
+    constexpr index_t KPerXdl    = 8;
+
+    using TestProblem = SimpleCShuffleEpilogueProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      ODataType,
+                                                      kMPerBlock,
+                                                      kNPerBlock,
+                                                      MWave,
+                                                      NWave,
+                                                      MPerXdl,
+                                                      NPerXdl,
+                                                      KPerXdl>;
+
+    auto result = run_cshuffle_epilogue_test<TestProblem, kMPerBlock, kNPerBlock>(ScaleType::None);
+    EXPECT_FLOAT_EQ(result[0], 2.0F) << "Basic CShuffleEpilogue test failed";
+}
+
+TEST_F(CShuffleEpilogueTest, BasicHalfTestWithScale)
+{
+    // Basic test configuration with half_t data types
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using ODataType   = ck_tile::half_t;
+
+    constexpr index_t kMPerBlock = 256;
+    constexpr index_t kNPerBlock = 256;
+    constexpr index_t MWave      = 2;
+    constexpr index_t NWave      = 2;
+    constexpr index_t MPerXdl    = 32;
+    constexpr index_t NPerXdl    = 32;
+    constexpr index_t KPerXdl    = 8;
+
+    using TestProblem = SimpleCShuffleEpilogueProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      ODataType,
+                                                      kMPerBlock,
+                                                      kNPerBlock,
+                                                      MWave,
+                                                      NWave,
+                                                      MPerXdl,
+                                                      NPerXdl,
+                                                      KPerXdl>;
+
+    auto result =
+        run_cshuffle_epilogue_test<TestProblem, kMPerBlock, kNPerBlock>(ScaleType::RowCol);
+    EXPECT_FLOAT_EQ(result[0], 2.0F) << "RowCol CShuffleEpilogue test failed: first element not 2";
+    EXPECT_FLOAT_EQ(result[1], 4.0F)
+        << "RowCol CShuffleEpilogue test failed: second element not 2*2";
+}
+
+TEST_F(CShuffleEpilogueTest, BasicHalfTestWithTensorScale)
+{
+    // Basic test configuration with half_t data types
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using ODataType   = ck_tile::half_t;
+
+    constexpr index_t kMPerBlock = 256;
+    constexpr index_t kNPerBlock = 256;
+    constexpr index_t MWave      = 2;
+    constexpr index_t NWave      = 2;
+    constexpr index_t MPerXdl    = 32;
+    constexpr index_t NPerXdl    = 32;
+    constexpr index_t KPerXdl    = 8;
+
+    using TestProblem = SimpleCShuffleEpilogueProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      ODataType,
+                                                      kMPerBlock,
+                                                      kNPerBlock,
+                                                      MWave,
+                                                      NWave,
+                                                      MPerXdl,
+                                                      NPerXdl,
+                                                      KPerXdl>;
+
+    auto result =
+        run_cshuffle_epilogue_test<TestProblem, kMPerBlock, kNPerBlock>(ScaleType::Tensor);
+    EXPECT_FLOAT_EQ(result[0], 4.0F)
+        << "TensorScale CShuffleEpilogue test failed: first element not 2*2=4";
+}
+
+int main(int argc, char** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp b/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp
new file mode 100644
index 0000000000..01e6c91c7c
--- /dev/null
+++ b/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+#include <hip/hip_runtime.h>
+
+namespace ck_tile {
+
+enum class ScaleType
+{
+    None,
+    RowCol,
+    Tensor
+};
+
+// Simple test kernel to invoke the CShuffleEpilogue
+template <typename Problem, index_t M, index_t N, ScaleType Scale>
+__global__ void test_cshuffle_epilogue_kernel(typename Problem::ODataType* __restrict__ output_data,
+                                              float* m_scale,
+                                              float* n_scale)
+{
+    using Epilogue = CShuffleEpilogue<Problem>;
+
+    static_assert(Problem::kMPerBlock <= M && Problem::kNPerBlock <= N,
+                  "Block size must fit in tensor dimensions");
+
+    // Allocate shared memory for epilogue
+    __shared__ char smem[Epilogue::GetSmemSize()];
+
+    // Create accumulator tile
+    constexpr auto lds_distribution_encode =
+        make_static_tile_distribution(Epilogue::MakeLdsDistributionEncode());
+    auto acc_tile =
+        make_static_distributed_tensor<typename Epilogue::AccDataType>(lds_distribution_encode);
+
+    // Fill acc_tile with a simple pattern
+    auto& acc_buffer = acc_tile.get_thread_buffer();
+    acc_buffer[0]    = 2.0F;
+
+    // Create output tensor view
+    auto output_tensor_view =
+        make_naive_tensor_view<address_space_enum::global>(output_data,
+                                                           make_tuple(M, N),
+                                                           make_tuple(N, 1),
+                                                           number<Epilogue::GetVectorSizeC()>{},
+                                                           number<1>{});
+
+    // Create output tile window
+    auto output_tile_window =
+        make_tile_window(output_tensor_view,
+                         make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
+                         {0, 0});
+
+    // Create empty D tensors tuple (we're ignoring ds_dram_windows for this test)
+    auto empty_ds = make_tuple();
+
+    // Call the epilogue
+    if constexpr(Scale == ScaleType::RowCol)
+    {
+        const auto m_scale_window = make_tile_window(
+            make_naive_tensor_view<address_space_enum::global>(
+                m_scale, make_tuple(M, N), make_tuple(1, 0), number<1>{}, number<1>{}),
+            make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
+            {0, 0});
+        const auto n_scale_window = make_tile_window(
+            make_naive_tensor_view<address_space_enum::global>(
+                n_scale, make_tuple(M, N), make_tuple(0, 1), number<1>{}, number<1>{}),
+            make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
+            {0, 0});
+        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem, m_scale_window, n_scale_window);
+    }
+    else if constexpr(Scale == ScaleType::Tensor)
+    {
+        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem, *m_scale, *n_scale);
+    }
+    else
+    {
+        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem);
+    }
+}
+
+// Test configuration helper
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename ODataType,
+          index_t kM,
+          index_t kN,
+          index_t MWave,
+          index_t NWave,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t KPerXdl>
+using SimpleCShuffleEpilogueProblem =
+    CShuffleEpilogueProblem<ADataType,
+                            BDataType,
+                            ck_tile::tuple<>, // Empty Ds datatype tuple
+                            AccDataType,
+                            ODataType,
+                            ck_tile::tuple<>,                   // Empty Ds layout
+                            tensor_layout::gemm::RowMajor,      // ELayout
+                            ck_tile::element_wise::PassThrough, // CDElementwise
+                            kM,
+                            kN,
+                            MWave,
+                            NWave,
+                            MPerXdl,
+                            NPerXdl,
+                            KPerXdl,
+                            false, // isCTransposed,
+                            memory_operation_enum::set>;
+
+template <typename Problem, index_t M, index_t N>
+auto run_cshuffle_epilogue_test(ScaleType scale = ScaleType::None)
+{
+    using ODataType = typename Problem::ODataType;
+
+    constexpr index_t kMPerBlock = Problem::kMPerBlock;
+    constexpr index_t kNPerBlock = Problem::kNPerBlock;
+    constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    std::cout << "Running CShuffleEpilogue test with M=" << M << ", N=" << N
+              << ", MPerBlock=" << kMPerBlock << ", NPerBlock=" << kNPerBlock
+              << ", BlockSize=" << kBlockSize << std::endl;
+
+    // Allocate host memory
+    const size_t output_size = M * N;
+
+    std::vector<ODataType> host_output(output_size, static_cast<ODataType>(0));
+
+    // Allocate device memory
+    ODataType* device_output;
+
+    HIP_CHECK_ERROR(hipMalloc(&device_output, output_size * sizeof(ODataType)));
+
+    HIP_CHECK_ERROR(hipMemcpy(
+        device_output, host_output.data(), output_size * sizeof(ODataType), hipMemcpyHostToDevice));
+
+    // Launch kernel
+    dim3 gridSize(1, 1, 1);
+    dim3 blockSize(kBlockSize, 1, 1);
+
+    if(scale == ScaleType::RowCol)
+    {
+        float* m_scale;
+        float* n_scale;
+        std::vector<float> h_m_scale(M, 1.0F);
+        std::vector<float> h_n_scale(N, 1.0F);
+        h_n_scale[1] = 2.0F; // multiply one col only with 2
+        HIP_CHECK_ERROR(hipMalloc(&m_scale, M * sizeof(float)));
+        HIP_CHECK_ERROR(hipMalloc(&n_scale, N * sizeof(float)));
+        HIP_CHECK_ERROR(
+            hipMemcpy(m_scale, h_m_scale.data(), M * sizeof(float), hipMemcpyHostToDevice));
+        HIP_CHECK_ERROR(
+            hipMemcpy(n_scale, h_n_scale.data(), N * sizeof(float), hipMemcpyHostToDevice));
+        test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::RowCol>
+            <<<gridSize, blockSize>>>(device_output, m_scale, n_scale);
+    }
+    else if(scale == ScaleType::Tensor)
+    {
+        float* m_scale;
+        float* n_scale;
+        std::vector<float> h_m_scale(1, 2.0F);
+        std::vector<float> h_n_scale(1, 1.0F);
+        HIP_CHECK_ERROR(hipMalloc(&m_scale, sizeof(float)));
+        HIP_CHECK_ERROR(hipMalloc(&n_scale, sizeof(float)));
+        HIP_CHECK_ERROR(hipMemcpy(m_scale, h_m_scale.data(), sizeof(float), hipMemcpyHostToDevice));
+        HIP_CHECK_ERROR(hipMemcpy(n_scale, h_n_scale.data(), sizeof(float), hipMemcpyHostToDevice));
+        test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::Tensor>
+            <<<gridSize, blockSize>>>(device_output, m_scale, n_scale);
+    }
+    else
+    {
+        test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::None>
+            <<<gridSize, blockSize>>>(device_output, nullptr, nullptr);
+    }
+
+    // Check for kernel launch errors
+    HIP_CHECK_ERROR(hipGetLastError());
+    HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+    // Copy results back
+    HIP_CHECK_ERROR(hipMemcpy(
+        host_output.data(), device_output, output_size * sizeof(ODataType), hipMemcpyDeviceToHost));
+
+    // Cleanup
+    HIP_CHECK_ERROR(hipFree(device_output));
+
+    return host_output;
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/fmha/CMakeLists.txt b/test/ck_tile/fmha/CMakeLists.txt
new file mode 100644
index 0000000000..8e5cce4c0b
--- /dev/null
+++ b/test/ck_tile/fmha/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Keep in sync with example/ck_tile/01_fmha/CMakeLists.txt
+if(NOT SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+    return()
+endif()
+
+set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances")
+set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances")
+
+add_gtest_executable(test_ck_tile_fmha_bwd_fp32 test_fmha_bwd_fp32.cpp)
+target_link_libraries(test_ck_tile_fmha_bwd_fp32 PRIVATE ${FMHA_BWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_bwd_bf16 test_fmha_bwd_bf16.cpp)
+target_link_libraries(test_ck_tile_fmha_bwd_bf16 PRIVATE ${FMHA_BWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_bwd_fp16 test_fmha_bwd_fp16.cpp)
+target_link_libraries(test_ck_tile_fmha_bwd_fp16 PRIVATE ${FMHA_BWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_fwd_fp32 test_fmha_fwd_fp32.cpp)
+target_link_libraries(test_ck_tile_fmha_fwd_fp32 PRIVATE ${FMHA_FWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_fwd_bf16 test_fmha_fwd_bf16.cpp)
+target_link_libraries(test_ck_tile_fmha_fwd_bf16 PRIVATE ${FMHA_FWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_fwd_fp16 test_fmha_fwd_fp16.cpp)
+target_link_libraries(test_ck_tile_fmha_fwd_fp16 PRIVATE ${FMHA_FWD_INSTANCES})
+
+add_gtest_executable(test_ck_tile_fmha_fwd_fp8 test_fmha_fwd_fp8.cpp)
+target_link_libraries(test_ck_tile_fmha_fwd_fp8 PRIVATE ${FMHA_FWD_INSTANCES})
+
+add_custom_target(test_ck_tile_fmha
+    DEPENDS
+        test_ck_tile_fmha_bwd_fp32
+        test_ck_tile_fmha_bwd_bf16
+        test_ck_tile_fmha_bwd_fp16
+        test_ck_tile_fmha_fwd_fp32
+        test_ck_tile_fmha_fwd_bf16
+        test_ck_tile_fmha_fwd_fp16
+        test_ck_tile_fmha_fwd_fp8
+)
diff --git a/test/ck_tile/fmha/test_fmha_bwd.inc b/test/ck_tile/fmha/test_fmha_bwd.inc
new file mode 100644
index 0000000000..704b5c7bf7
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_bwd.inc
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+using ::testing::Bool;
+using ::testing::Combine;
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+// Random seed used for initializing input tensors. 0 for non-deterministic seed
+CK_TILE_DECLARE_ENV_VAR(CK_TILE_TEST_SEED, uint64_t, 123456)
+
+// Whether to run long tests (from smoke_test_fwd.sh)
+CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_FMHA_LONG_TESTS)
+
+#define CHECK_RESULT(result)                                      \
+    do                                                            \
+    {                                                             \
+        if(result == bwd_result::no_instance)                     \
+            GTEST_SKIP() << "No instance for current parameters"; \
+        ASSERT_EQ(result, bwd_result::success);                   \
+    } while(0)
+
+const ck_tile::stream_config stream_config{
+    nullptr, // stream_id_
+    false,   // time_kernel_
+    1,       // log_level_
+    0,       // cold_niters_
+    1,       // nrepeat_
+    true,    // is_gpu_timer_
+    false,   // flush_cache_
+    1,       // rotating_count_
+};
+
+#define COMMON_ARGS                                                                           \
+    init_method, static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, \
+        stream_config
+
+auto EnableTestIf(bool condition)
+{
+    return ValuesIn(condition ? std::vector<bool>{true} : std::vector<bool>{});
+}
+
+class AllLong : public TestWithParam<std::tuple<bool,
+                                                std::tuple<int, int>,
+                                                bool,
+                                                mode_enum,
+                                                std::string,
+                                                float,
+                                                std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AllLong);
+
+// Test cases from example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+
+INSTANTIATE_TEST_SUITE_P(
+    TestCkTileFmhaBwd,
+    AllLong,
+    Combine(EnableTestIf(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_FMHA_LONG_TESTS))),
+            HDimValues,
+            Bool(),
+            ModeValues,
+            Values("n", "a"),
+            Values(0.0f, 0.2f),
+            Values(std::tuple{1, 4, 2, 259, -1, "0"},
+                   std::tuple{2, 2, -1, 516, 253, "0"},
+                   std::tuple{1, 4, 1, 500, 251, "1"},
+                   std::tuple{1, 2, -1, 900, 258, "2"},
+                   std::tuple{2, 1, -1, 987, 219, "t:128,30"},
+                   std::tuple{2, 3, 1, 244, 499, "b:4,35"})));
+
+TEST_P(AllLong, Test)
+{
+    auto [_, hdims, perm, mode, bias_str, p_drop, dims_mask]   = GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               perm,     // i_perm
+                                               perm,     // o_perm
+                                               0,        // scale
+                                               bias_str, // bias_str
+                                               false,    // use_dbias
+                                               p_drop,   // p_drop
+                                               123,      // drop_seed
+                                               1024,     // drop_offset
+                                               true,     // drop_prefs
+                                               mask_str, // mask_str
+                                               false,    // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class HDimPadding
+    : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                      bool,
+                                      mode_enum,
+                                      std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
+                         HDimPadding,
+                         Combine(Values(std::tuple{24, 48},
+                                        std::tuple{48, 48},
+                                        std::tuple{72, 72},
+                                        std::tuple{96, 96},
+                                        std::tuple{120, 160},
+                                        std::tuple{256, 108},
+                                        std::tuple{40, 64}),
+                                 Bool(),
+                                 ModeValues,
+                                 Values(std::tuple{1, 4, 2, 480, -1, "0"},
+                                        std::tuple{2, 2, -1, 300, 400, "t:64,64"},
+                                        std::tuple{1, 4, 1, 512, 201, "1"},
+                                        std::tuple{1, 2, -1, 900, 256, "0"},
+                                        std::tuple{2, 1, -1, 256, 256, "1"})));
+
+TEST_P(HDimPadding, Test)
+{
+    auto [hdims, perm, mode, dims_mask]                        = GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               perm,     // i_perm
+                                               perm,     // o_perm
+                                               0,        // scale
+                                               "n",      // bias_str
+                                               false,    // use_dbias
+                                               0.0f,     // p_drop
+                                               0,        // drop_seed
+                                               0,        // drop_offset
+                                               false,    // drop_prefs
+                                               mask_str, // mask_str
+                                               false,    // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class ElementwiseBias
+    : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                      bool,
+                                      mode_enum,
+                                      std::string,
+                                      bool,
+                                      std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
+                         ElementwiseBias,
+                         Combine(HDimValues,
+                                 Bool(), // layouts of bias and dbias are controlled by i_perm
+                                 ModeValues,
+                                 Values("e:0", "e:1", "e:2"),
+                                 Bool(),
+                                 Values(std::tuple{1, 4, 2, 1024, 100, "0"},
+                                        std::tuple{3, 2, -1, 128, 256, "2"},
+                                        std::tuple{2, 2, -1, 130, 499, "t:50,64"})));
+
+TEST_P(ElementwiseBias, Test)
+{
+    auto [hdims, i_perm, mode, bias_str, use_dbias, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               i_perm,    // i_perm
+                                               false,     // o_perm
+                                               0,         // scale
+                                               bias_str,  // bias_str
+                                               use_dbias, // use_dbias
+                                               0.0f,      // p_drop
+                                               123,       // drop_seed
+                                               1024,      // drop_offset
+                                               true,      // drop_prefs
+                                               mask_str,  // mask_str
+                                               false,     // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class Alibi : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                              mode_enum,
+                                              std::string,
+                                              std::tuple<int, int, int, int, int>,
+                                              std::string>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
+                         Alibi,
+                         Combine(HDimValues,
+                                 ModeValues,
+                                 Values("a:0", "a:1"),
+                                 Values(std::tuple{1, 3, 3, 1024, 1000},
+                                        std::tuple{3, 5, 5, 128, 256},
+                                        std::tuple{2, 8, 4, 130, 320}),
+                                 Values("0", "t", "b", "t:50,64", "b:32,40")));
+
+TEST_P(Alibi, Test)
+{
+    auto [hdims, mode, bias_str, dims, mask_str]     = GetParam();
+    auto [hdim_q, hdim_v]                            = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k] = dims;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               true,     // i_perm
+                                               true,     // o_perm
+                                               0,        // scale
+                                               bias_str, // bias_str
+                                               false,    // use_dbias
+                                               0.0f,     // p_drop
+                                               0,        // drop_seed
+                                               0,        // drop_offset
+                                               false,    // drop_prefs
+                                               mask_str, // mask_str
+                                               false,    // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class Dropout : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                                mode_enum,
+                                                float,
+                                                std::tuple<uint64_t, uint64_t, bool>,
+                                                std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
+                         Dropout,
+                         Combine(HDimValues,
+                                 ModeValues,
+                                 Values(0.123f, 0.5f),
+                                 Values(std::tuple{10, 123, false},
+                                        std::tuple{34534564645, 7876878876864, true}),
+                                 Values(std::tuple{2, 6, 2, 180, 512, "0"},
+                                        std::tuple{3, 2, 2, 256, 128, "1"},
+                                        std::tuple{4, 2, 1, 100, 768, "2"})));
+
+TEST_P(Dropout, Test)
+{
+    auto [hdims, mode, p_drop, drop_seed_offset_prefs, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                         = hdims;
+    auto [drop_seed, drop_offset, drop_prefs]                     = drop_seed_offset_prefs;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str]    = dims_mask;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               true,        // i_perm
+                                               true,        // o_perm
+                                               0.1f,        // scale
+                                               "n",         // bias_str
+                                               false,       // use_dbias
+                                               p_drop,      // p_drop
+                                               drop_seed,   // drop_seed
+                                               drop_offset, // drop_offset
+                                               drop_prefs,  // drop_prefs
+                                               mask_str,    // mask_str
+                                               false,       // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class Deterministic
+    : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                      bool,
+                                      mode_enum,
+                                      std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
+                         Deterministic,
+                         Combine(HDimValues,
+                                 Bool(),
+                                 ModeValues,
+                                 Values(std::tuple{2, 6, 2, 180, 512, "0"},
+                                        std::tuple{3, 3, 1, 256, 128, "1"},
+                                        std::tuple{4, 2, 2, 768, 100, "2"})));
+
+TEST_P(Deterministic, Test)
+{
+    auto [hdims, i_perm, mode, dims_mask]                      = GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_bwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {seqlen_q},
+                                               {seqlen_k},
+                                               hdim_q,
+                                               hdim_v,
+                                               i_perm,   // i_perm
+                                               true,     // o_perm
+                                               0,        // scale
+                                               "n",      // bias_str
+                                               false,    // use_dbias
+                                               0.0f,     // p_drop
+                                               0,        // drop_seed
+                                               0,        // drop_offset
+                                               false,    // drop_prefs
+                                               mask_str, // mask_str
+                                               true,     // deterministic
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
diff --git a/test/ck_tile/fmha/test_fmha_bwd_bf16.cpp b/test/ck_tile/fmha/test_fmha_bwd_bf16.cpp
new file mode 100644
index 0000000000..077e45a10d
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_bwd_bf16.cpp
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_bwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_bwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+using DataTypeConfig = FmhaBwdBf16;
+
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+const auto HDimValues =
+    Values(std::tuple{32, -1}, std::tuple{64, -1}, std::tuple{128, -1}, std::tuple{256, -1});
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+constexpr auto init_method = "uf";
+
+#include "test_fmha_bwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_bwd_fp16.cpp b/test/ck_tile/fmha/test_fmha_bwd_fp16.cpp
new file mode 100644
index 0000000000..86621b0494
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_bwd_fp16.cpp
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_bwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_bwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+using DataTypeConfig = FmhaBwdFp16;
+
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+const auto HDimValues =
+    Values(std::tuple{32, -1}, std::tuple{64, -1}, std::tuple{128, -1}, std::tuple{256, -1});
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+constexpr auto init_method = "uf";
+
+#include "test_fmha_bwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_bwd_fp32.cpp b/test/ck_tile/fmha/test_fmha_bwd_fp32.cpp
new file mode 100644
index 0000000000..d409d0dd30
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_bwd_fp32.cpp
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_bwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_bwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+using DataTypeConfig = FmhaBwdFp32;
+
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+const auto HDimValues = Values(std::tuple{32, -1}, std::tuple{64, -1}, std::tuple{128, -1});
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+constexpr std::string init_method = "uf";
+
+#include "test_fmha_bwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_fwd.inc b/test/ck_tile/fmha/test_fmha_fwd.inc
new file mode 100644
index 0000000000..ccca5cf969
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_fwd.inc
@@ -0,0 +1,1084 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+using ::testing::Bool;
+using ::testing::Combine;
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+// Random seed used for initializing input tensors. 0 for non-deterministic seed
+CK_TILE_DECLARE_ENV_VAR(CK_TILE_TEST_SEED, uint64_t, 123456)
+
+// Whether to run long tests (from smoke_test_fwd.sh)
+CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_FMHA_LONG_TESTS)
+
+#define CHECK_RESULT(result)                                      \
+    do                                                            \
+    {                                                             \
+        if(result == fwd_result::no_instance)                     \
+            GTEST_SKIP() << "No instance for current parameters"; \
+        ASSERT_EQ(result, fwd_result::success);                   \
+    } while(0)
+
+const ck_tile::stream_config stream_config{
+    nullptr, // stream_id_
+    false,   // time_kernel_
+    1,       // log_level_
+    0,       // cold_niters_
+    1,       // nrepeat_
+    true,    // is_gpu_timer_
+    false,   // flush_cache_
+    1,       // rotating_count_
+};
+
+#define COMMON_ARGS                                                                           \
+    init_method, static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, \
+        stream_config
+
+auto EnableTestIf(bool condition)
+{
+    return ValuesIn(condition ? std::vector<bool>{true} : std::vector<bool>{});
+}
+
+class AllLong : public TestWithParam<
+                    std::tuple<bool,
+                               std::tuple<int, int>,
+                               bool,
+                               bool,
+                               mode_enum,
+                               bool,
+                               std::string,
+                               float,
+                               std::tuple<int, int, int, int, int, int, int, int, std::string>>>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AllLong);
+
+// Test cases from example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+
+INSTANTIATE_TEST_SUITE_P(
+    TestCkTileFmhaFwd,
+    AllLong,
+    Combine(EnableTestIf(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_FMHA_LONG_TESTS))),
+            HDimValues,
+            Bool(),
+            IsVRowmajorValues,
+            ModeValues,
+            Bool(),
+            Values("n", "e", "a"),
+            Values(0.0f, 0.2f),
+            Values(std::tuple{2, 2, 1, 16, -1, 55, 256, -1, "0"},
+                   std::tuple{1, 3, -1, -1, -1, 100, 51, -1, "0"},
+                   std::tuple{2, 1, -1, 16, -1, 99, 256, -1, "1"},
+                   std::tuple{1, 2, 1, -1, -1, 1024, 256, -1, "2"},
+                   std::tuple{2, 1, -1, -1, 24, 3, 99, -1, "2"},
+                   std::tuple{3, 2, 1, -1, -1, 200, 520, -1, "t:128,30"},
+                   std::tuple{2, 1, -1, -1, -1, 99, 32, -1, "b:4,35"},
+                   std::tuple{1, 2, 1, -1, -1, 33, 0, -1, "2"},
+                   std::tuple{1, 2, 1, -1, -1, 1, 10, 32, "2"})));
+
+TEST_P(AllLong, Test)
+{
+    auto [_, hdims, perm, is_v_rowmajor, mode, lse, bias_str, p_drop, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                                        = hdims;
+    auto [batch, nhead, nhead_k, hdim_q_, hdim_v_, seqlen_q, seqlen_k, seqlen_kpad, mask_str] =
+        dims_mask;
+
+    hdim_q = hdim_q_ == -1 ? hdim_q : hdim_q_;
+    hdim_v = hdim_v_ == -1 ? hdim_v : hdim_v_;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,             // seqlen_knew
+                                               {-1},          // seqlen_qpads
+                                               {seqlen_kpad}, // seqlen_kpads
+                                               {},            // q_eff_lens_per_batch
+                                               {},            // kv_eff_lens_per_batch
+                                               0,             // rotary_dim
+                                               perm,          // i_perm
+                                               perm,          // o_perm
+                                               0,             // scale_s
+                                               0,             // logits_soft_cap
+                                               is_v_rowmajor, // is_v_rowmajor
+                                               lse,           // lse
+                                               0,             // page_block_size
+                                               false,         // use_cache_batch_idx
+                                               bias_str,      // bias_str
+                                               p_drop,        // p_drop
+                                               123,           // drop_seed
+                                               1024,          // drop_offset
+                                               false,         // drop_prefs
+                                               mask_str,      // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+// ---------------------------------------------------------------
+// Negative tests: padding not supported with appendkv/splitkv/pagedkv
+// ---------------------------------------------------------------
+
+#if CK_TILE_FMHA_FWD_APPENDKV_API
+TEST(TestCkTileFmhaFwd, AppendKvWithBatchEffLensShouldFail)
+{
+    // batch mode effective lengths simulate padding
+    auto result = fmha_fwd_run<DataTypeConfig>(
+        mode_enum::batch,
+        2,          // batch
+        4,          // nhead
+        -1,         // nhead_k
+        {128},      // seqlen_qs
+        {128},      // seqlen_ks
+        64,         // hdim_q
+        64,         // hdim_v
+        32,         // seqlen_knew -> triggers appendkv
+        {},         // seqlen_qpads
+        {},         // seqlen_kpads
+        {100, 120}, // q_eff_lens_per_batch
+        {90, 110},  // kv_eff_lens_per_batch
+        0,          // rotary_dim
+        true,       // i_perm
+        true,       // o_perm
+        0,          // scale_s
+        0,          // logits_soft_cap
+        def_is_v_rowmajor,
+        def_lse,
+        0,     // page_block_size
+        false, // use_cache_batch_idx
+        "n",   // bias
+        0.0f,  // p_drop
+        0,     // drop_seed
+        0,     // drop_offset
+        false, // drop_prefs
+        "0",   // mask
+        squant,
+        true, // is_rotary_interleaved
+        1,    // num_splits
+        init_method,
+        static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))),
+        0,
+        stream_config);
+    ASSERT_EQ(result, fwd_result::invalid_args);
+}
+#endif
+
+#if CK_TILE_FMHA_FWD_SPLITKV_API
+TEST(TestCkTileFmhaFwd, SplitKvWithGroupPaddingShouldFail)
+{
+    // group mode physical padding
+    auto result = fmha_fwd_run<DataTypeConfig>(
+        mode_enum::group,
+        2,          // batch
+        4,          // nhead
+        -1,         // nhead_k
+        {96, 120},  // seqlen_qs logical
+        {96, 120},  // seqlen_ks logical
+        64,         // hdim_q
+        64,         // hdim_v
+        0,          // seqlen_knew
+        {128, 128}, // seqlen_qpads
+        {128, 128}, // seqlen_kpads
+        {},         // q_eff
+        {},         // kv_eff
+        0,          // rotary_dim
+        true,       // i_perm
+        true,       // o_perm
+        0,          // scale_s
+        0,          // logits_soft_cap
+        def_is_v_rowmajor,
+        def_lse,
+        0,     // page_block_size
+        false, // use_cache_batch_idx
+        "n",   // bias
+        0.0f,
+        0,
+        0,
+        false,
+        "0",
+        squant,
+        true,
+        2, // num_splits (>1 triggers splitkv)
+        init_method,
+        static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))),
+        0,
+        stream_config);
+    ASSERT_EQ(result, fwd_result::invalid_args);
+}
+#endif
+
+#if CK_TILE_FMHA_FWD_PAGEDKV_API
+TEST(TestCkTileFmhaFwd, PagedKvWithGroupPaddingShouldFail)
+{
+    auto result = fmha_fwd_run<DataTypeConfig>(
+        mode_enum::group,
+        2,
+        4,
+        -1,
+        {80, 100},
+        {80, 100},
+        64,
+        64,
+        0,         // seqlen_knew
+        {96, 128}, // seqlen_qpads
+        {96, 128}, // seqlen_kpads
+        {},
+        {},
+        0,
+        true,
+        true,
+        0,
+        0,
+        def_is_v_rowmajor,
+        def_lse,
+        128, // page_block_size triggers pagedkv
+        false,
+        "n",
+        0.0f,
+        0,
+        0,
+        false,
+        "0",
+        squant,
+        true,
+        1,
+        init_method,
+        static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))),
+        0,
+        stream_config);
+    ASSERT_EQ(result, fwd_result::invalid_args);
+}
+#endif
+
+class HDimPadding
+    : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                      bool,
+                                      bool,
+                                      mode_enum,
+                                      std::tuple<int, int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         HDimPadding,
+                         Combine(Values(std::tuple{24, 48},
+                                        std::tuple{120, 160},
+                                        std::tuple{256, 108},
+                                        std::tuple{40, 64}),
+                                 Bool(),
+                                 IsVRowmajorValues,
+                                 ModeValues,
+                                 Values(std::tuple{1, 4, 2, 480, -1, -1, "0"},
+                                        std::tuple{2, 2, -1, 300, 400, 512, "t:64,64"},
+                                        std::tuple{1, 4, 1, 512, 201, 256, "1"},
+                                        std::tuple{1, 2, -1, 900, 256, -1, "0"},
+                                        std::tuple{2, 1, -1, 256, 256, -1, "1"})));
+
+TEST_P(HDimPadding, Test)
+{
+    auto [hdims, perm, is_v_rowmajor, mode, dims_mask]                      = GetParam();
+    auto [hdim_q, hdim_v]                                                   = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, seqlen_kpad, mask_str] = dims_mask;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,             // seqlen_knew
+                                               {-1},          // seqlen_qpads
+                                               {seqlen_kpad}, // seqlen_kpads
+                                               {},            // q_eff_lens_per_batch
+                                               {},            // kv_eff_lens_per_batch
+                                               0,             // rotary_dim
+                                               perm,          // i_perm
+                                               perm,          // o_perm
+                                               0,             // scale_s
+                                               0,             // logits_soft_cap
+                                               is_v_rowmajor, // is_v_rowmajor
+                                               def_lse,       // lse
+                                               0,             // page_block_size
+                                               false,         // use_cache_batch_idx
+                                               "n",           // bias_str
+                                               0.0f,          // p_drop
+                                               0,             // drop_seed
+                                               0,             // drop_offset
+                                               false,         // drop_prefs
+                                               mask_str,      // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class ElementwiseBias
+    : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                      bool,
+                                      mode_enum,
+                                      std::string,
+                                      std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         ElementwiseBias,
+                         Combine(HDimValues,
+                                 Bool(), // layout of bias is controlled by i_perm
+                                 ModeValues,
+                                 Values("e:0", "e:1", "e:2"),
+                                 Values(std::tuple{1, 4, 2, 1024, 100, "0"},
+                                        std::tuple{3, 2, -1, 128, 256, "2"},
+                                        std::tuple{2, 2, -1, 130, 499, "t:50,64"})));
+
+TEST_P(ElementwiseBias, Test)
+{
+    auto [hdims, i_perm, mode, bias_str, dims_mask]            = GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,                 // seqlen_knew
+                                               {-1},              // seqlen_qpads
+                                               {-1},              // seqlen_kpads
+                                               {},                // q_eff_lens_per_batch
+                                               {},                // kv_eff_lens_per_batch
+                                               0,                 // rotary_dim
+                                               i_perm,            // i_perm
+                                               false,             // o_perm
+                                               0,                 // scale_s
+                                               0,                 // logits_soft_cap
+                                               def_is_v_rowmajor, // is_v_rowmajor
+                                               def_lse,           // lse
+                                               0,                 // page_block_size
+                                               false,             // use_cache_batch_idx
+                                               bias_str,          // bias_str
+                                               0.0f,              // p_drop
+                                               0,                 // drop_seed
+                                               0,                 // drop_offset
+                                               false,             // drop_prefs
+                                               mask_str,          // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class Alibi : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                              mode_enum,
+                                              std::string,
+                                              std::tuple<int, int, int, int, int>,
+                                              std::string>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         Alibi,
+                         Combine(HDimValues,
+                                 ModeValues,
+                                 Values("a:0", "a:1"),
+                                 Values(std::tuple{1, 3, 3, 1024, 1000},
+                                        std::tuple{3, 5, 5, 128, 256},
+                                        std::tuple{2, 8, 2, 300, 355}),
+                                 Values("0", "t", "b", "t:50,64", "b:32,40")));
+
+TEST_P(Alibi, Test)
+{
+    auto [hdims, mode, bias_str, dims, mask_str]     = GetParam();
+    auto [hdim_q, hdim_v]                            = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k] = dims;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,                 // seqlen_knew
+                                               {-1},              // seqlen_qpads
+                                               {-1},              // seqlen_kpads
+                                               {},                // q_eff_lens_per_batch
+                                               {},                // kv_eff_lens_per_batch
+                                               0,                 // rotary_dim
+                                               true,              // i_perm
+                                               true,              // o_perm
+                                               0,                 // scale_s
+                                               0,                 // logits_soft_cap
+                                               def_is_v_rowmajor, // is_v_rowmajor
+                                               def_lse,           // lse
+                                               0,                 // page_block_size
+                                               false,             // use_cache_batch_idx
+                                               bias_str,          // bias_str
+                                               0.0f,              // p_drop
+                                               0,                 // drop_seed
+                                               0,                 // drop_offset
+                                               false,             // drop_prefs
+                                               mask_str,          // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class Dropout : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                                mode_enum,
+                                                float,
+                                                std::tuple<uint64_t, uint64_t, bool>,
+                                                std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         Dropout,
+                         Combine(HDimValues,
+                                 ModeValues,
+                                 Values(0.123f, 0.5f),
+                                 Values(std::tuple{10, 123, false},
+                                        std::tuple{34534564645, 7876878876864, true}),
+                                 Values(std::tuple{2, 4, 2, 280, 512, "0"},
+                                        std::tuple{3, 2, 2, 256, 128, "1"},
+                                        std::tuple{4, 3, 1, 100, 768, "2"})));
+
+TEST_P(Dropout, Test)
+{
+    auto [hdims, mode, p_drop, drop_seed_offset_prefs, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                         = hdims;
+    auto [drop_seed, drop_offset, drop_prefs]                     = drop_seed_offset_prefs;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str]    = dims_mask;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,                 // seqlen_knew
+                                               {-1},              // seqlen_qpads
+                                               {-1},              // seqlen_kpads
+                                               {},                // q_eff_lens_per_batch
+                                               {},                // kv_eff_lens_per_batch
+                                               0,                 // rotary_dim
+                                               false,             // i_perm
+                                               false,             // o_perm
+                                               0,                 // scale_s
+                                               0,                 // logits_soft_cap
+                                               def_is_v_rowmajor, // is_v_rowmajor
+                                               def_lse,           // lse
+                                               0,                 // page_block_size
+                                               false,             // use_cache_batch_idx
+                                               "n",               // bias_str
+                                               p_drop,            // p_drop
+                                               drop_seed,         // drop_seed
+                                               drop_offset,       // drop_offset
+                                               drop_prefs,        // drop_prefs
+                                               mask_str,          // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+#if CK_TILE_FMHA_FWD_PAGEDKV_API
+
+class PagedKV : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                                bool,
+                                                bool,
+                                                mode_enum,
+                                                int,
+                                                std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PagedKV);
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         PagedKV,
+                         Combine(SplitKVHDimValues,
+                                 Bool(),            // layouts of k and v are controlled by i_perm
+                                 IsVRowmajorValues, // layout of v is controlled by is_v_rowmajor
+                                 ModeValues,
+                                 Values(128, 256),
+                                 Values(std::tuple{2, 3, 1, 200, 1024, "0"},
+                                        std::tuple{3, 2, -1, 128, 768, "2"},
+                                        std::tuple{2, 2, -1, 230, 899, "t:50,64"})));
+
+TEST_P(PagedKV, Test)
+{
+    auto [hdims, i_perm, is_v_rowmajor, mode, page_block_size, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                                 = hdims;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str]            = dims_mask;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,               // seqlen_knew
+                                               {-1},            // seqlen_qpads
+                                               {-1},            // seqlen_kpads
+                                               {},              // q_eff_lens_per_batch
+                                               {},              // kv_eff_lens_per_batch
+                                               0,               // rotary_dim
+                                               i_perm,          // i_perm
+                                               false,           // o_perm
+                                               0,               // scale_s
+                                               0,               // logits_soft_cap
+                                               is_v_rowmajor,   // is_v_rowmajor
+                                               false,           // lse
+                                               page_block_size, // page_block_size
+                                               false,           // use_cache_batch_idx
+                                               "n",             // bias_str
+                                               0.0f,            // p_drop
+                                               0,               // drop_seed
+                                               0,               // drop_offset
+                                               false,           // drop_prefs
+                                               mask_str,        // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+#endif // CK_TILE_FMHA_FWD_PAGEDKV_API
+
+#if CK_TILE_FMHA_FWD_SPLITKV_API
+
+class SplitKV : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                                bool,
+                                                bool,
+                                                std::tuple<mode_enum, bool>,
+                                                int,
+                                                std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SplitKV);
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         SplitKV,
+                         Combine(SplitKVHDimValues,
+                                 Bool(),            // layouts of k and v are controlled by i_perm
+                                 IsVRowmajorValues, // layout of v is controlled by is_v_rowmajor
+                                 Values(std::tuple{mode_enum::batch, false},
+                                        std::tuple{mode_enum::batch, true},
+                                        std::tuple{mode_enum::group, false}),
+                                 Values(3, 4),
+                                 Values(std::tuple{4, 3, 1, 200, 1024, "0"},
+                                        std::tuple{2, 2, -1, 512, 2000, "0"},
+                                        std::tuple{3, 2, -1, 230, 899, "t:128,128"})));
+
+TEST_P(SplitKV, Test)
+{
+    auto [hdims, i_perm, is_v_rowmajor, mode_use_cache_batch_idx, num_splits, dims_mask] =
+        GetParam();
+    auto [hdim_q, hdim_v]                                      = hdims;
+    auto [mode, use_cache_batch_idx]                           = mode_use_cache_batch_idx;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               0,                   // seqlen_knew
+                                               {-1},                // seqlen_qpads
+                                               {-1},                // seqlen_kpads
+                                               {},                  // q_eff_lens_per_batch
+                                               {},                  // kv_eff_lens_per_batch
+                                               0,                   // rotary_dim
+                                               i_perm,              // i_perm
+                                               false,               // o_perm
+                                               0,                   // scale_s
+                                               0,                   // logits_soft_cap
+                                               is_v_rowmajor,       // is_v_rowmajor
+                                               def_lse,             // lse
+                                               0,                   // page_block_size
+                                               use_cache_batch_idx, // use_cache_batch_idx
+                                               "n",                 // bias_str
+                                               0.0f,                // p_drop
+                                               0,                   // drop_seed
+                                               0,                   // drop_offset
+                                               false,               // drop_prefs
+                                               mask_str,            // mask_str
+                                               squant,
+                                               true,       // is_rotary_interleaved
+                                               num_splits, // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+#endif // CK_TILE_FMHA_FWD_SPLITKV_API
+
+#if CK_TILE_FMHA_FWD_APPENDKV_API
+
+class AppendKV : public TestWithParam<std::tuple<std::tuple<int, int>,
+                                                 bool,
+                                                 bool,
+                                                 std::tuple<int, bool>,
+                                                 int,
+                                                 std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    TestCkTileFmhaFwd,
+    AppendKV,
+    Combine(AppendKVHDimValues,
+            Bool(),            // layouts of k and v are controlled by i_perm
+            IsVRowmajorValues, // layout of v is controlled by is_v_rowmajor
+            ValuesIn({std::tuple{0, true}, std::tuple{0, false}, std::tuple{128, false}}),
+            Values(1, 64, -1),
+            Values(std::tuple{3, 3, -1, 60, 129, "t:32,32"},
+                   std::tuple{3, 2, 2, 256, 256, "0"},
+                   std::tuple{2, 3, 1, 264, 265, "1"},
+                   std::tuple{4, 4, 2, 71, 64, "1"})));
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AppendKV);
+
+TEST_P(AppendKV, Test)
+{
+    auto [hdims,
+          i_perm,
+          is_v_rowmajor,
+          page_block_size_use_cache_batch_idx,
+          seqlen_knew,
+          dims_mask]                            = GetParam();
+    auto [hdim_q, hdim_v]                       = hdims;
+    auto [page_block_size, use_cache_batch_idx] = page_block_size_use_cache_batch_idx;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;
+
+    seqlen_knew = seqlen_knew == -1 ? seqlen_k : seqlen_knew;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode_enum::batch,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               seqlen_knew,         // seqlen_knew
+                                               {-1},                // seqlen_qpads
+                                               {-1},                // seqlen_kpads
+                                               {},                  // q_eff_lens_per_batch
+                                               {},                  // kv_eff_lens_per_batch
+                                               0,                   // rotary_dim
+                                               i_perm,              // i_perm
+                                               true,                // o_perm
+                                               0,                   // scale_s
+                                               0,                   // logits_soft_cap
+                                               is_v_rowmajor,       // is_v_rowmajor
+                                               def_lse,             // lse
+                                               page_block_size,     // page_block_size
+                                               use_cache_batch_idx, // use_cache_batch_idx
+                                               "n",                 // bias_str
+                                               0.0f,                // p_drop
+                                               0,                   // drop_seed
+                                               0,                   // drop_offset
+                                               false,               // drop_prefs
+                                               mask_str,            // mask_str
+                                               squant,
+                                               false, // is_rotary_interleaved
+                                               1,     // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+class AppendKVRoPE
+    : public TestWithParam<std::tuple<bool,
+                                      std::tuple<int, int>,
+                                      bool,
+                                      bool,
+                                      std::tuple<int, bool>,
+                                      int,
+                                      std::tuple<int, int, int, int, int, std::string>>>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AppendKVRoPE);
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd,
+                         AppendKVRoPE,
+                         Combine(EnableTestIf(!std::is_same_v<DataTypeConfig, FmhaFwdFp8>),
+                                 AppendKVHDimValues,
+                                 Bool(),            // layouts of k and v are controlled by i_perm
+                                 IsVRowmajorValues, // layout of v is controlled by is_v_rowmajor
+                                 Values(std::tuple{0, false},
+                                        std::tuple{16, true},
+                                        std::tuple{32, false},
+                                        std::tuple{-1, true}),
+                                 Values(16, 50, -1),
+                                 Values(std::tuple{2, 3, -1, 60, 129, "t:32,32"},
+                                        std::tuple{1, 2, 1, 128, 55, "0"},
+                                        std::tuple{3, 4, 2, 72, 128, "1"})));
+
+TEST_P(AppendKVRoPE, Test)
+{
+    auto [_, hdims, i_perm, is_v_rowmajor, rotary, seqlen_knew, dims_mask] = GetParam();
+    auto [hdim_q, hdim_v]                                                  = hdims;
+    auto [rotary_dim, is_rotary_interleaved]                               = rotary;
+    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str]             = dims_mask;
+
+    rotary_dim  = rotary_dim == -1 ? hdim_q : rotary_dim;
+    seqlen_knew = seqlen_knew == -1 ? seqlen_k : seqlen_knew;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode_enum::batch,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               {adjust_seqlen(seqlen_q)},
+                                               {adjust_seqlen(seqlen_k)},
+                                               hdim_q,
+                                               hdim_v,
+                                               seqlen_knew,   // seqlen_knew
+                                               {-1},          // seqlen_qpads
+                                               {-1},          // seqlen_kpads
+                                               {},            // q_eff_lens_per_batch
+                                               {},            // kv_eff_lens_per_batch
+                                               rotary_dim,    // rotary_dim
+                                               i_perm,        // i_perm
+                                               true,          // o_perm
+                                               0,             // scale_s
+                                               0,             // logits_soft_cap
+                                               is_v_rowmajor, // is_v_rowmajor
+                                               true,          // lse
+                                               0,             // page_block_size
+                                               false,         // use_cache_batch_idx
+                                               "n",           // bias_str
+                                               0.0f,          // p_drop
+                                               0,             // drop_seed
+                                               0,             // drop_offset
+                                               false,         // drop_prefs
+                                               mask_str,      // mask_str
+                                               squant,
+                                               is_rotary_interleaved, // is_rotary_interleaved
+                                               1,                     // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
+
+#endif // CK_TILE_FMHA_FWD_APPENDKV_API
+
+// ---------------------------------------------------------------
+// Parameterized padding tests (batch & group) using Combine+Values
+// ---------------------------------------------------------------
+
+using PaddingParam = std::tuple<mode_enum,        // mode
+                                int,              // batch
+                                int,              // nhead
+                                int,              // nhead_k
+                                std::vector<int>, // seqlen_qs (logical)
+                                std::vector<int>, // seqlen_ks (logical)
+                                std::vector<int>, // seqlen_qpads (physical padded lengths)
+                                std::vector<int>, // seqlen_kpads (physical padded lengths)
+                                std::vector<int>, // q_eff_lens
+                                std::vector<int>, // kv_eff_lens
+                                bool,             // i_perm
+                                bool,             // o_perm
+                                std::string>;     // mask_str
+
+// Ensure headers for containers / algorithms used in padding param builder.
+#include <vector>
+#include <array>
+#include <cmath>
+#include <algorithm>
+
+class PaddingCases : public TestWithParam<PaddingParam>
+{
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PaddingCases);
+
+// Build padding test params programmatically to enforce constraints
+static std::vector<PaddingParam> BuildPaddingParams()
+{
+    std::vector<PaddingParam> params;
+
+    // mask variants to cover
+    const std::vector<std::string> mask_variants{"0", "t:50,64", "b:32,40"};
+    const std::vector<std::string> mask_variants_reduced{"0", "t:50,64"}; // used for trimmed sets
+
+    // Representative ratio pairs (q_ratio, k_ratio) to avoid explosion
+    const std::vector<std::pair<double, double>> ratio_pairs_full{
+        {1.0, 1.0}, // both full
+        {1.0, 0.5}, // q full, k half
+        {0.5, 1.0}, // q half, k full
+    };
+    const std::vector<std::pair<double, double>> ratio_pairs_reduced{{1.0, 1.0}, {0.5, 1.0}};
+
+    // candidate physical seqlens for batch mode (single value) & for group mode (per batch)
+    const std::vector<int> physical_lengths_full{64, 128, 256};
+    const std::vector<int> physical_lengths_reduced{64};
+
+    // batch sizes to sample
+    const std::vector<int> batch_sizes{1, 4};
+    // --------------------------------------------------------------------
+    // Head configuration space (cover MHA, GQA, MQA)
+    //  - Standard MHA: nhead_k == -1 (treated internally as nhead)
+    //  - GQA: nhead_k > 0 and nhead % nhead_k == 0, nhead_k < nhead
+    //  - MQA: nhead_k == 1
+    // We choose (9, -1), (9, 3), (9, 1) so that divisibility holds. Full
+    // combinatorics only applied to the first (standard) configuration to
+    // avoid test explosion.
+    // --------------------------------------------------------------------
+    struct HeadCfg
+    {
+        int nhead;
+        int nhead_k; // -1 for standard; else must divide nhead
+        bool full;   // whether to use full coverage sets
+    };
+    const std::vector<HeadCfg> head_cfgs = {
+        {9, -1, true}, // MHA full
+        {9, 3, false}, // GQA reduced (nhead/nhead_k=3)
+        {9, 1, false}  // MQA reduced
+    };
+
+    // Helper to clamp and ensure >=1
+    auto logical_len = [](int physical, double ratio) {
+        int v = static_cast<int>(std::round(physical * ratio));
+        v     = std::max(1, std::min(v, physical));
+        return v;
+    };
+    // Iterate over head configurations
+    for(const auto& hc : head_cfgs)
+    {
+        const auto& ratio_pairs        = hc.full ? ratio_pairs_full : ratio_pairs_reduced;
+        const auto& phys_lengths_batch = hc.full ? physical_lengths_full : physical_lengths_reduced;
+        const auto& phys_lengths_group_q = phys_lengths_batch; // reuse
+        const auto& phys_lengths_group_k = phys_lengths_batch; // reuse
+        const auto& masks                = hc.full ? mask_variants : mask_variants_reduced;
+
+        // -----------------
+        // Batch mode params (effective lengths only)
+        // -----------------
+        for(int b : batch_sizes)
+        {
+            for(int phys_qkv : phys_lengths_batch)
+            {
+                for(const auto& rkpair : ratio_pairs)
+                {
+                    double rq = rkpair.first;
+                    double rk = rkpair.second;
+                    std::vector<int> q_eff(b), kv_eff(b);
+                    int log_q = logical_len(phys_qkv, rq);
+                    int log_k = logical_len(phys_qkv, rk);
+                    for(int i = 0; i < b; ++i)
+                    {
+                        q_eff[i]  = log_q;
+                        kv_eff[i] = log_k;
+                    }
+                    for(const auto& mask : masks)
+                    {
+                        params.emplace_back(PaddingParam{mode_enum::batch,
+                                                         b,
+                                                         hc.nhead,
+                                                         hc.nhead_k,
+                                                         {phys_qkv}, // seqlen_qs
+                                                         {phys_qkv}, // seqlen_ks
+                                                         {},         // seqlen_qpads
+                                                         {},         // seqlen_kpads
+                                                         q_eff,
+                                                         kv_eff,
+                                                         true,
+                                                         true,
+                                                         mask});
+                    }
+                }
+                // Single-token logical length case (both q & k = 1)
+                for(const auto& mask : masks)
+                {
+                    std::vector<int> q_eff(b, 1), kv_eff(b, 1);
+                    params.emplace_back(PaddingParam{mode_enum::batch,
+                                                     b,
+                                                     hc.nhead,
+                                                     hc.nhead_k,
+                                                     {phys_qkv},
+                                                     {phys_qkv},
+                                                     {},
+                                                     {},
+                                                     q_eff,
+                                                     kv_eff,
+                                                     true,
+                                                     true,
+                                                     mask});
+                }
+            }
+        }
+
+        // -----------------
+        // Group mode params (physical padding + logical variants)
+        // -----------------
+        for(int b : batch_sizes)
+        {
+            for(int phys_q : phys_lengths_group_q)
+            {
+                for(int phys_k : phys_lengths_group_k)
+                {
+                    for(const auto& rkpair : ratio_pairs)
+                    {
+                        double rq = rkpair.first;
+                        double rk = rkpair.second;
+                        std::vector<int> seqlen_qs(b), seqlen_ks(b), seqlen_qpads(b),
+                            seqlen_kpads(b);
+                        for(int i = 0; i < b; ++i)
+                        {
+                            seqlen_qpads[i] = phys_q;
+                            seqlen_kpads[i] = phys_k;
+                            seqlen_qs[i]    = logical_len(phys_q, rq);
+                            seqlen_ks[i]    = logical_len(phys_k, rk);
+                        }
+                        std::array<std::pair<std::vector<int>, std::vector<int>>, 3> pad_variants{
+                            std::pair{seqlen_qpads, seqlen_kpads}, // both
+                            std::pair{seqlen_qpads, seqlen_ks},    // only q padding
+                            std::pair{seqlen_qs, seqlen_kpads}     // only kv padding
+                        };
+                        for(const auto& mask : masks)
+                        {
+                            for(const auto& pv : pad_variants)
+                            {
+                                params.emplace_back(PaddingParam{mode_enum::group,
+                                                                 b,
+                                                                 hc.nhead,
+                                                                 hc.nhead_k,
+                                                                 seqlen_qs,
+                                                                 seqlen_ks,
+                                                                 pv.first,
+                                                                 pv.second,
+                                                                 {},
+                                                                 {},
+                                                                 true,
+                                                                 true,
+                                                                 mask});
+                            }
+                        }
+                    }
+                    // Single-token logical length case
+                    for(const auto& mask : masks)
+                    {
+                        std::vector<int> seqlen_qs(b, 1), seqlen_ks(b, 1);
+                        std::vector<int> seqlen_qpads(b, phys_q), seqlen_kpads(b, phys_k);
+                        // both padding variant only (others degenerate)
+                        params.emplace_back(PaddingParam{mode_enum::group,
+                                                         b,
+                                                         hc.nhead,
+                                                         hc.nhead_k,
+                                                         seqlen_qs,
+                                                         seqlen_ks,
+                                                         seqlen_qpads,
+                                                         seqlen_kpads,
+                                                         {},
+                                                         {},
+                                                         true,
+                                                         true,
+                                                         mask});
+                    }
+                }
+            }
+        }
+    }
+
+    return params;
+}
+
+static const std::vector<PaddingParam> kPaddingParams = BuildPaddingParams();
+
+INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaFwd_Padding, PaddingCases, ValuesIn(kPaddingParams));
+
+TEST_P(PaddingCases, Test)
+{
+    if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp8>)
+    {
+        GTEST_SKIP() << "Skip for fp8";
+    }
+
+    auto [mode,
+          batch,
+          nhead,
+          nhead_k,
+          seqlen_qs,
+          seqlen_ks,
+          seqlen_qpads,
+          seqlen_kpads,
+          q_eff_lens,
+          kv_eff_lens,
+          i_perm,
+          o_perm,
+          mask_str] = GetParam();
+
+    // For batch mode we wrap single logical lengths with adjust_seqlen.
+    std::vector<int> adj_qs =
+        (mode == mode_enum::batch) ? std::vector<int>{adjust_seqlen(seqlen_qs.at(0))} : seqlen_qs;
+    std::vector<int> adj_ks =
+        (mode == mode_enum::batch) ? std::vector<int>{adjust_seqlen(seqlen_ks.at(0))} : seqlen_ks;
+
+    const int hdim_q      = 64;
+    const int hdim_v      = 64;
+    const int seqlen_knew = 0;
+
+    auto result = fmha_fwd_run<DataTypeConfig>(mode,
+                                               batch,
+                                               nhead,
+                                               nhead_k,
+                                               adj_qs,
+                                               adj_ks,
+                                               hdim_q,
+                                               hdim_v,
+                                               seqlen_knew,  // seqlen_knew
+                                               seqlen_qpads, // seqlen_qpads
+                                               seqlen_kpads, // seqlen_kpads
+                                               q_eff_lens,   // q_eff_lens_per_batch
+                                               kv_eff_lens,  // kv_eff_lens_per_batch
+                                               0,            // rotary_dim
+                                               i_perm,       // i_perm
+                                               o_perm,       // o_perm
+                                               0,            // scale_s
+                                               0,            // logits_soft_cap
+                                               def_is_v_rowmajor,
+                                               def_lse,  // lse
+                                               0,        // page_block_size
+                                               false,    // use_cache_batch_idx
+                                               "n",      // bias_str
+                                               0.0f,     // p_drop
+                                               0,        // drop_seed
+                                               0,        // drop_offset
+                                               false,    // drop_prefs
+                                               mask_str, // mask_str
+                                               squant,
+                                               true, // is_rotary_interleaved
+                                               1,    // num_splits
+                                               COMMON_ARGS);
+    CHECK_RESULT(result);
+}
diff --git a/test/ck_tile/fmha/test_fmha_fwd_bf16.cpp b/test/ck_tile/fmha/test_fmha_fwd_bf16.cpp
new file mode 100644
index 0000000000..fbc6449a6a
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_fwd_bf16.cpp
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_fwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_fwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+#include <tuple>
+#include <string>
+
+using ::testing::Values;
+
+using DataTypeConfig = FmhaFwdBf16;
+
+const auto HDimValues = Values(std::tuple{32, -1},
+                               std::tuple{64, -1},
+                               std::tuple{96, 128},
+                               std::tuple{128, -1},
+                               std::tuple{192, 128},
+                               std::tuple{192, -1},
+                               std::tuple{256, -1});
+
+const auto SplitKVHDimValues = Values(std::tuple{32, -1},
+                                      std::tuple{64, -1},
+                                      std::tuple{96, -1},
+                                      std::tuple{128, -1},
+                                      std::tuple{256, -1});
+
+const auto AppendKVHDimValues =
+    Values(std::tuple{32, -1}, std::tuple{64, -1}, std::tuple{128, -1}, std::tuple{256, -1});
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+const auto IsVRowmajorValues = Values(false, true);
+
+const bool squant             = false;
+const std::string init_method = "uf";
+const bool def_lse            = true;
+const bool def_is_v_rowmajor  = true;
+
+int adjust_seqlen(int seqlen) { return seqlen; }
+
+#include "test_fmha_fwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_fwd_fp16.cpp b/test/ck_tile/fmha/test_fmha_fwd_fp16.cpp
new file mode 100644
index 0000000000..abc2c44726
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_fwd_fp16.cpp
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_fwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_fwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+#include <tuple>
+#include <string>
+
+using ::testing::Values;
+
+using DataTypeConfig = FmhaFwdFp16;
+
+const auto HDimValues = Values(std::tuple{32, -1},
+                               std::tuple{64, -1},
+                               std::tuple{96, 128},
+                               std::tuple{128, -1},
+                               std::tuple{192, 128},
+                               std::tuple{192, -1},
+                               std::tuple{256, -1});
+
+const auto SplitKVHDimValues = Values(std::tuple{32, -1},
+                                      std::tuple{64, -1},
+                                      std::tuple{96, -1},
+                                      std::tuple{128, -1},
+                                      std::tuple{256, -1});
+
+const auto AppendKVHDimValues =
+    Values(std::tuple{32, -1}, std::tuple{64, -1}, std::tuple{128, -1}, std::tuple{256, -1});
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+const auto IsVRowmajorValues = Values(false, true);
+
+const bool squant             = false;
+const std::string init_method = "uf";
+const bool def_lse            = true;
+const bool def_is_v_rowmajor  = true;
+
+int adjust_seqlen(int seqlen) { return seqlen; }
+
+#include "test_fmha_fwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_fwd_fp32.cpp b/test/ck_tile/fmha/test_fmha_fwd_fp32.cpp
new file mode 100644
index 0000000000..00f1eb0629
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_fwd_fp32.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_fwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_fwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+#include <tuple>
+#include <string>
+
+using ::testing::Values;
+
+using DataTypeConfig = FmhaFwdFp32;
+
+const auto HDimValues = Values(std::tuple{32, -1},
+                               std::tuple{48, -1},
+                               std::tuple{64, -1},
+                               std::tuple{96, 128},
+                               std::tuple{128, -1},
+                               std::tuple{192, -1},
+                               std::tuple{256, -1});
+
+const auto SplitKVHDimValues = Values();
+
+const auto AppendKVHDimValues = Values();
+
+const auto ModeValues = Values(mode_enum::batch, mode_enum::group);
+
+const auto IsVRowmajorValues = Values(true);
+
+const bool squant             = false;
+const std::string init_method = "uf";
+const bool def_lse            = true;
+const bool def_is_v_rowmajor  = true;
+
+int adjust_seqlen(int seqlen) { return seqlen; }
+
+#include "test_fmha_fwd.inc"
diff --git a/test/ck_tile/fmha/test_fmha_fwd_fp8.cpp b/test/ck_tile/fmha/test_fmha_fwd_fp8.cpp
new file mode 100644
index 0000000000..b99c304d1f
--- /dev/null
+++ b/test/ck_tile/fmha/test_fmha_fwd_fp8.cpp
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "example/ck_tile/01_fmha/fmha_fwd.hpp"
+#include "example/ck_tile/01_fmha/fmha_fwd_runner.hpp"
+
+#include "gtest/gtest.h"
+
+#include <tuple>
+#include <string>
+
+using ::testing::Values;
+
+using DataTypeConfig = FmhaFwdFp8;
+
+// Currently there are no fp8 instances for splitkv, pagedkv by default (the tests pass if such
+// instances are added), however the corresponding tests are not disabled (they will be skipped)
+// in case such instances will be added in the future.
+
+const auto HDimValues = Values(std::tuple{64, -1}, std::tuple{128, -1});
+
+const auto SplitKVHDimValues = Values(std::tuple{64, -1}, std::tuple{128, -1});
+
+const auto AppendKVHDimValues = Values(std::tuple{64, -1}, std::tuple{128, -1});
+
+// There are no fp8 instances with seqlen padding (mode_enum::group requires it)
+const auto ModeValues = Values(mode_enum::batch);
+
+const auto IsVRowmajorValues = Values(false);
+
+const auto squant             = true;
+const std::string init_method = "uf";
+const bool def_lse            = false;
+const bool def_is_v_rowmajor  = true;
+
+int adjust_seqlen(int seqlen)
+{
+    // There are no fp8 instances with padding, pad seqlen to avoid skipping most of the tests
+    return ck_tile::integer_least_multiple(seqlen, 128);
+}
+
+#include "test_fmha_fwd.inc"
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index 5d34943e0d..44e2433060 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -12,16 +12,16 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
     -enable-noalias-to-md-conversion=0
 )
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_compv3 test_gemm_pipeline_compv3.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_compv4 test_gemm_pipeline_compv4.cpp)
-
-    target_compile_options(test_ck_tile_gemm_pipeline_mem PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_compv3 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_compv4 PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
-
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
+    add_test_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
+    target_compile_options(test_ck_tile_gemm_pipeline_universal_int8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    add_test_executable(test_ck_tile_gemm_pipeline_universal_pk_int4 test_gemm_pipeline_universal_pk_int4.cpp)
+    target_compile_options(test_ck_tile_gemm_pipeline_universal_pk_int4 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+else()
+    message(DEBUG "Skipping ck_tile_gemm tests for current target")
+endif()
 
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_test_executable(test_ck_tile_gemm_pipeline_universal_fp8 test_gemm_pipeline_universal_fp8.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_universal_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_universal_bf8 test_gemm_pipeline_universal_bf8.cpp)
@@ -30,37 +30,47 @@ if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
     target_compile_options(test_ck_tile_gemm_pipeline_basic_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_basic_bf8 test_gemm_pipeline_basic_bf8.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_basic_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    
-    add_test_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_int8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_test_executable(test_ck_tile_gemm_pipeline_universal_pk_int4 test_gemm_pipeline_universal_pk_int4.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_pk_int4 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    
-elseif(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
-    # On Radeon devices, build the WMMA version instead
-    add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_mem_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_compv3_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_compv4_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
 else()
     message(DEBUG "Skipping ck_tile_gemm tests for current target")
 endif()
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95" OR GPU_TARGETS MATCHES "gfx90a")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_persistent test_gemm_pipeline_persistent.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_persistent PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
     add_test_executable(test_ck_tile_gemm_pipeline_universal_fp16 test_gemm_pipeline_universal_fp16.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE --save-temps -Wno-gnu-line-marker)
     add_test_executable(test_ck_tile_gemm_pipeline_universal_bf16 test_gemm_pipeline_universal_bf16.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_universal_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_basic_fp16 test_gemm_pipeline_basic_fp16.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_basic_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_basic_bf16 test_gemm_pipeline_basic_bf16.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_basic_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-elseif(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_persistent_wmma test_gemm_pipeline_persistent_wmma.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_persistent_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+else()
+    message(DEBUG "Skipping ck_tile_gemm tests for current target ")
+endif()
+
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
+    if(GPU_TARGETS MATCHES "gfx94|gfx95")
+        add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_compv3 test_gemm_pipeline_compv3.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_compv4 test_gemm_pipeline_compv4.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_persistent test_gemm_pipeline_persistent.cpp)
+        target_compile_options(test_ck_tile_gemm_pipeline_mem PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_compv3 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_compv4 PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_persistent PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    endif()
+
+    if(GPU_TARGETS MATCHES "gfx11|gfx12")
+    # On Radeon devices, build the WMMA version instead
+        add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)
+        add_gtest_executable(test_ck_tile_gemm_pipeline_persistent_wmma test_gemm_pipeline_persistent_wmma.cpp)
+        target_compile_options(test_ck_tile_gemm_pipeline_mem_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_compv3_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_compv4_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
+        target_compile_options(test_ck_tile_gemm_pipeline_persistent_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    endif()
+else()
+    message(DEBUG "Skipping ck_tile_gemm tests for current target test_ck_tile_gemm_pipeline") 
 endif()
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
index 4e3033782c..23548f2f92 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
@@ -2,4 +2,11 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "test_gemm_pipeline_basic_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::bf16_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success      = run_gemm_combinations<ck_tile::bf16_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::bf16_t, ck_tile::pk_int4_t, ck_tile::bf16_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
index 61614fc6f5..cbf25a223a 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
@@ -2,4 +2,12 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "test_gemm_pipeline_basic_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
index c667c08053..7afeb4140d 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
@@ -2,4 +2,13 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "test_gemm_pipeline_basic_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success      = run_gemm_combinations<ck_tile::half_t>() && is_success;
+#if 0
+    is_success =
+        run_gemm_combinations<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+#endif
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
index 9a3498b7ea..0ba4b54403 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
@@ -2,4 +2,12 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "test_gemm_pipeline_basic_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
index 1fdf26f01c..2c8a776f10 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
@@ -13,6 +13,28 @@
 #include "test_gemm_pipeline_smoke_util.hpp"
 #include "test_gemm_pipeline_smoke_run_test.inc"
 
+struct GemmConfig_Mfma : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
+struct GemmConfig_Wmma : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
 template <typename GemmConfig,
           typename ADataType,
           typename BDataType,
@@ -38,17 +60,17 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     constexpr int kBlockPerCu = 1;
 
     // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
+    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
+    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
+    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
 
     constexpr ck_tile::index_t M_Warp = 2;
     constexpr ck_tile::index_t N_Warp = 2;
     constexpr ck_tile::index_t K_Warp = 1;
 
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
+    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
+    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
+    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
 
     using CodegenGemmShape =
         ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
@@ -130,7 +152,10 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     }
 }
 
-template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
 bool run_gemm_test_prec_type(std::string a_layout,
                              std::string b_layout,
                              ck_tile::ArgParser& arg_parser)
@@ -142,12 +167,12 @@ bool run_gemm_test_prec_type(std::string a_layout,
     {
         if(a_layout == "R" && b_layout == "C")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Col{}, Col{}, Row{});
         }
         else
@@ -160,22 +185,22 @@ bool run_gemm_test_prec_type(std::string a_layout,
     {
         if(a_layout == "R" && b_layout == "C")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "R" && b_layout == "R")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Row{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "R")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Col{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
-            return run_gemm_test_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
                 arg_parser, Col{}, Col{}, Row{});
         }
         else
@@ -185,7 +210,7 @@ bool run_gemm_test_prec_type(std::string a_layout,
     }
 }
 
-template <typename APrecType, typename BPrecType, typename CPrecType>
+template <typename GemmConfig, typename APrecType, typename BPrecType, typename CPrecType>
 bool run_gemm_test(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -195,11 +220,12 @@ bool run_gemm_test(int argc, char* argv[])
     std::string a_layout = arg_parser.get_str("a_layout");
     std::string b_layout = arg_parser.get_str("b_layout");
 
-    return run_gemm_test_prec_type<APrecType, BPrecType, CPrecType>(a_layout, b_layout, arg_parser);
+    return run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(
+        a_layout, b_layout, arg_parser);
 }
 
 template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
-int run_gemm_combinations()
+bool run_gemm_combinations()
 {
     // Define possible values for each parameter
     std::vector<std::string> m_values = {"128", "1024"};
@@ -255,8 +281,15 @@ int run_gemm_combinations()
                 // Call the function with the current configuration
                 try
                 {
-                    is_success = run_gemm_test<APrecType, BPrecType, CPrecType>(ARG_COUNT, argv) &&
+#if CK_TILE_USE_WMMA
+                    is_success = run_gemm_test<GemmConfig_Wmma, APrecType, BPrecType, CPrecType>(
+                                     ARG_COUNT, argv) &&
                                  is_success;
+#else
+                    is_success = run_gemm_test<GemmConfig_Mfma, APrecType, BPrecType, CPrecType>(
+                                     ARG_COUNT, argv) &&
+                                 is_success;
+#endif
                 }
                 catch(const ArgumentsNotSupportedException& e)
                 {
@@ -271,5 +304,5 @@ int run_gemm_combinations()
             }
         }
     }
-    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+    return is_success;
 }
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
index 54410acf70..b3d433c466 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
index ab74e4e7b1..57feefceab 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
@@ -2,6 +2,8 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
+#include "ck_tile/host/permute_pk_int4.hpp"
+
 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
 {
@@ -91,61 +93,6 @@ void permute_tensor_b(Tensor& tensor)
     }
 }
 
-template <typename Tensor>
-void permute_vectors_i4x4_b(Tensor& tensor)
-{
-    const ck_tile::index_t K = tensor.get_length(0);
-    const ck_tile::index_t N = tensor.get_length(1);
-    // vector pk_i4x4 permute
-    for(int i = 0; i < N; i++)
-    {
-        for(int j = 0; j < K; j += 8)
-        {
-            int8_t input[8];
-
-            for(int k = 0; k < 4; k++)
-            {
-                int8_t i4x2      = tensor(j + k * 2, i).data;
-                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
-                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
-            }
-
-            // permute 01234567->20643175
-            {
-                int8_t hi   = input[2];
-                int8_t lo   = input[0];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 0, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[6];
-                int8_t lo   = input[4];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 2, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[3];
-                int8_t lo   = input[1];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 4, i) = i4x2;
-            }
-
-            {
-                int8_t hi   = input[7];
-                int8_t lo   = input[5];
-                int8_t i4x2 = (hi << 4) | lo;
-
-                tensor(j + 6, i) = i4x2;
-            }
-        }
-    }
-}
-
 template <typename GemmConfig,
           typename ADataType,
           typename BDataType,
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
index fc1ba149e3..821acd02ea 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
@@ -224,6 +224,27 @@ struct GemmConfigComputeV5 : public GemmConfigBase
     static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
 };
 
+template <typename PrecType>
+struct GemmConfigComputeV3_WMMA : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
 template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
 struct GemmTypeConfig;
 
@@ -246,6 +267,15 @@ struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
     using CDataType   = ck_tile::bf16_t;
 };
 
+template <>
+struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::pk_int4_t, ck_tile::bf16_t>
+{
+    using ADataType   = ck_tile::bf16_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::bf16_t;
+};
+
 template <>
 struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
 {
@@ -264,6 +294,15 @@ struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
     using CDataType   = ck_tile::half_t;
 };
 
+template <>
+struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::bf8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
 template <>
 struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
 {
@@ -273,6 +312,15 @@ struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
     using CDataType   = ck_tile::half_t;
 };
 
+template <>
+struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
 template <>
 struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
 {
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
index 1336f6fd70..cf8cbd69c5 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
@@ -6,4 +6,11 @@
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::bf16_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success      = run_gemm_combinations<ck_tile::bf16_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::bf16_t, ck_tile::pk_int4_t, ck_tile::bf16_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
index 5d55f34b84..90f539f176 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
@@ -6,4 +6,12 @@
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
index 0cebbcc721..727d43282a 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
@@ -6,4 +6,11 @@
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success      = run_gemm_combinations<ck_tile::half_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
index 29fb5f87ce..8fbbec8e9f 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
@@ -6,4 +6,12 @@
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>() && is_success;
+    is_success =
+        run_gemm_combinations<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
index e8a089d8ff..991f84788f 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
@@ -1,16 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstddef>
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <string>
-
 #include "ck_tile/host.hpp"
 #include "test_gemm_pipeline_smoke_util.hpp"
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
index 043db10fb0..8abf05dbcf 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
@@ -1,16 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstddef>
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <string>
-
 #include "ck_tile/host.hpp"
 #include "test_gemm_pipeline_smoke_util.hpp"
 #include "test_gemm_pipeline_smoke_run_test.inc"
 #include "test_gemm_pipeline_universal_run_test.inc"
 
-int main() { return run_gemm_combinations<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(); }
+int main()
+{
+    bool is_success = true;
+    is_success =
+        run_gemm_combinations<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>() && is_success;
+    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
index aa6ab1bffa..7a1314cd78 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
@@ -328,6 +328,13 @@ int run_gemm_combinations()
                 // Call the function with the current configuration
                 try
                 {
+#if CK_TILE_USE_WMMA
+                    is_success = run_gemm_test<GemmConfigComputeV3_WMMA<CPrecType>,
+                                               APrecType,
+                                               BPrecType,
+                                               CPrecType>(ARG_COUNT, argv) &&
+                                 is_success;
+#else
                     is_success = run_gemm_test<GemmConfigComputeV3<CPrecType>,
                                                APrecType,
                                                BPrecType,
@@ -338,6 +345,7 @@ int run_gemm_combinations()
                                                BPrecType,
                                                CPrecType>(ARG_COUNT, argv) &&
                                  is_success;
+#endif
                 }
                 catch(const ArgumentsNotSupportedException& e)
                 {
@@ -352,5 +360,5 @@ int run_gemm_combinations()
             }
         }
     }
-    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
+    return is_success;
 }
diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt
index 847ab88644..93a13ba5af 100644
--- a/test/ck_tile/gemm_block_scale/CMakeLists.txt
+++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt
@@ -6,14 +6,9 @@ endif()
 list(APPEND TEST_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
 if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
-    set(TEST_GEMM_NAME test_tile_gemm_aquant_basic)
-    set(QUANT_TYPES fp8 bf8 i4fp8 i4bf8 i4f32fp8 i4f32bf8)
-
-    foreach(QUANT_TYPE ${QUANT_TYPES})
-        add_gtest_executable(${TEST_GEMM_NAME}_${QUANT_TYPE} test_gemm_aquant_basic_${QUANT_TYPE}.cpp)
-        target_compile_options(${TEST_GEMM_NAME}_${QUANT_TYPE} PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
-    endforeach()
-
+    # Typed Test Suite for GEMM Quantization
+    add_gtest_executable(test_tile_gemm_quant_typed test_gemm_quant_typed.cpp)
+    target_compile_options(test_tile_gemm_quant_typed PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
 else()
     message(DEBUG "Skipping ck_tile quant gemm tests for current target")
 endif()
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_bf8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_bf8.cpp
deleted file mode 100644
index 9c4277d879..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_bf8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("bf8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_fp8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_fp8.cpp
deleted file mode 100644
index b0cf55be6f..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_fp8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("fp8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4bf8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4bf8.cpp
deleted file mode 100644
index fd80bf2b06..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4bf8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("i4bf8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32bf8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32bf8.cpp
deleted file mode 100644
index fe8c9c5000..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32bf8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("i4f32bf8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32fp8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32fp8.cpp
deleted file mode 100644
index a319d9c2ad..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4f32fp8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("i4f32fp8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4fp8.cpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4fp8.cpp
deleted file mode 100644
index ceb8760435..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_basic_i4fp8.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "test_run_gemm_aquant_example.inc"
-
-int main() { return run_gemm_combinations("i4fp8"); }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_aquant_utils.hpp b/test/ck_tile/gemm_block_scale/test_gemm_aquant_utils.hpp
deleted file mode 100644
index cf9bf18c5a..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_aquant_utils.hpp
+++ /dev/null
@@ -1,243 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/ops/gemm_group_quant.hpp"
-
-#define CK_TILE_PIPELINE_PREFILL 1
-#define CK_TILE_PIPELINE_DECODE 2
-#define CK_TILE_PIPELINE_PRESHUFFLEQUANT 3
-
-template <typename PrecType, ck_tile::index_t M_Warp_Tile>
-constexpr ck_tile::index_t get_k_warp_tile()
-{
-#if defined(__gfx950__)
-    constexpr bool is_8bit_float =
-        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
-    if constexpr(M_Warp_Tile == 32)
-        return is_8bit_float ? 64 : 16;
-    else
-        return is_8bit_float ? 128 : 32;
-#else
-    if constexpr(M_Warp_Tile == 32)
-        return 16;
-    else
-        return 32;
-#endif
-}
-
-template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
-auto calculate_rtol_atol(const ck_tile::index_t K,
-                         const ck_tile::index_t kbatch,
-                         const float max_accumulated_value)
-{
-    using ComputeType =
-        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
-    // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
-        ck_tile::integer_divide_ceil(K, kbatch));
-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
-        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
-    // Calculate error due to split_k accumulation
-    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
-    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
-        max_accumulated_value, kbatch);
-    // Use higher threshold
-    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
-}
-
-class ArgumentsNotSupportedException : public std::logic_error
-{
-    public:
-    explicit ArgumentsNotSupportedException(const std::string& message) : logic_error(message) {}
-};
-
-struct GemmConfigBase
-{
-    static constexpr bool kPadM = false;
-    static constexpr bool kPadN = false;
-    static constexpr bool kPadK = false;
-
-    static constexpr bool PermuteA = false;
-    static constexpr bool PermuteB = false;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-
-    static constexpr int kBlockPerCu                         = 1;
-    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    static constexpr ck_tile::index_t TileParitionerM01      = 4;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
-    static constexpr ck_tile::index_t NumWaveGroups = 1;
-    static constexpr bool PreshuffleQuant           = false;
-    static constexpr bool DoubleSmemBuffer          = true;
-};
-
-template <typename PrecType>
-struct GemmConfigDecode : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 16;
-    static constexpr ck_tile::index_t N_Tile = 64;
-    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 4;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_DECODE;
-};
-
-template <typename PrecType>
-struct GemmConfigPrefill : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 4;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr int kBlockPerCu           = 2;
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PREFILL;
-};
-
-template <typename PrecType>
-struct GemmConfigPreshuffleQuant : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 16;
-    static constexpr ck_tile::index_t N_Tile = 64;
-    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 4;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile =
-        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLEQUANT;
-    static constexpr bool PreshuffleQuant      = true;
-};
-
-template <typename ADataType_,
-          typename BDataType_ = ADataType_,
-          typename CDataType_ = ADataType_,
-          typename QDataType_ = float>
-struct GemmQuantTypeConfig
-{
-    using ADataType   = ADataType_;
-    using QDataType   = QDataType_;
-    using BDataType   = BDataType_;
-    using AccDataType = float;
-    using CDataType   = CDataType_;
-};
-
-template <typename T>
-struct DataTypeTraits;
-
-template <>
-struct DataTypeTraits<float>
-{
-    static constexpr const char* name = "fp32";
-};
-
-template <>
-struct DataTypeTraits<double>
-{
-    static constexpr const char* name = "fp64";
-};
-
-template <>
-struct DataTypeTraits<int32_t>
-{
-    static constexpr const char* name = "int32";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::half_t>
-{
-    static constexpr const char* name = "fp16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf16_t>
-{
-    static constexpr const char* name = "bf16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::fp8_t>
-{
-    static constexpr const char* name = "fp8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf8_t>
-{
-    static constexpr const char* name = "bf8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::pk_int4_t>
-{
-    static constexpr const char* name = "pk_int4_t";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::int8_t>
-{
-    static constexpr const char* name = "int8";
-};
-
-auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3840", "m dimension")
-        .insert("n", "4096", "n dimension")
-        .insert("k", "2048", "k dimension")
-        .insert("a_layout", "R", "A tensor data layout - Row by default")
-        .insert("aq_layout", "R", "Aq tensor data layout - Row by default")
-        .insert("b_layout", "C", "B tensor data layout - Column by default")
-        .insert("c_layout", "R", "C tensor data layout - Row by default")
-        .insert("stride_a", "0", "Tensor A stride")
-        .insert("stride_q", "0", "Tensor AQ stride")
-        .insert("stride_b", "0", "Tensor B stride")
-        .insert("stride_c", "0", "Tensor C stride")
-        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
-        .insert("prec", "i4fp8", "data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8")
-        .insert("warmup", "50", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("split_k", "1", "splitK value")
-        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
-        .insert("persistent", "0", "0:non-persistent, 1:persistent")
-        .insert("as_br_cr", "false", "Choose between as_br_cr and as_bs_cr");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
-
-// host API
-float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp
new file mode 100644
index 0000000000..355e9fce32
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <stdexcept>
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/host/check_err.hpp"
+#include "ck_tile/host/reference/reference_gemm.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm_quant.hpp"
+
+// Forward declarations for quant type-specific implementations
+template <ck_tile::QuantType QT>
+struct QuantTypeTraits;
+
+// Base class for common quant gemm functionality
+template <typename Tuple, typename Derived>
+class TestCkTileGemmQuantBase : public ::testing::Test
+{
+    protected:
+    using ALayout                            = std::tuple_element_t<0, Tuple>;
+    using BLayout                            = std::tuple_element_t<1, Tuple>;
+    using CLayout                            = std::tuple_element_t<2, Tuple>;
+    using ADataType                          = std::tuple_element_t<3, Tuple>;
+    using BDataType                          = std::tuple_element_t<4, Tuple>;
+    using QDataType                          = std::tuple_element_t<5, Tuple>;
+    using CDataType                          = std::tuple_element_t<6, Tuple>;
+    static constexpr auto QuantType          = std::tuple_element_t<7, Tuple>::value;
+    using GemmConfig                         = std::tuple_element_t<8, Tuple>;
+    static constexpr uint32_t QuantGroupSize = std::tuple_element_t<9, Tuple>::value;
+    using AccDataType                        = float; // accumulate always in float
+
+    // Get the quant-type specific data types from traits
+    using QuantTraits     = QuantTypeTraits<QuantType>;
+    using ComputeDataType = typename QuantTraits::template ComputeDataType<ADataType, BDataType>;
+
+    static constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
+    static constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
+    static constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
+
+    static constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
+    static constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
+    static constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
+    static constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
+    static constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
+    static constexpr bool PreshuffleQuant         = GemmConfig::PreshuffleQuant;
+    static constexpr bool PreshuffleB             = GemmConfig::PreshuffleB;
+    static constexpr bool DoubleSmemBuffer        = GemmConfig::DoubleSmemBuffer;
+
+    public:
+    void SetUp() override { static_cast<Derived*>(this)->SetUpQuantTypeSpecific(); }
+
+    void TearDown() override { static_cast<Derived*>(this)->TearDownQuantTypeSpecific(); }
+
+    // Common test execution logic
+    void invoke_quant_gemm(const ck_tile::QuantGemmHostArgs& args, const ck_tile::stream_config& s)
+    {
+        constexpr bool kPadM = false;
+        constexpr bool kPadN = false;
+        constexpr bool kPadK = false;
+
+        using CodegenGemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+        using CodegenGemmTraits = ck_tile::TileGemmQuantTraits<kPadM,
+                                                               kPadN,
+                                                               kPadK,
+                                                               PreshuffleQuant,
+                                                               PreshuffleB,
+                                                               ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               QuantType,
+                                                               ALayout,
+                                                               BLayout,
+                                                               DoubleSmemBuffer>;
+
+        // Let the derived class create the appropriate pipeline and epilogue
+        static_cast<Derived*>(this)
+            ->template run_quant_gemm_impl<CodegenGemmShape, TilePartitioner, CodegenGemmTraits>(
+                args, s);
+    }
+
+    void RunTest(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K)
+    {
+        // Generate test data and run the kernel
+        static_cast<Derived*>(this)->run_test_with_validation(M, N, K);
+    }
+
+    // Helper function to check layout
+    template <typename Layout>
+    static constexpr auto is_row_major(Layout)
+    {
+        return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(Layout{})>,
+                                                     ck_tile::tensor_layout::gemm::RowMajor>>{};
+    }
+
+    // Tolerance calculation function for validation
+    template <typename ADataType_, typename BDataType_, typename AccDataType_, typename CDataType_>
+    auto calculate_rtol_atol(const ck_tile::index_t K,
+                             const ck_tile::index_t kbatch,
+                             const float max_accumulated_value)
+    {
+        using ComputeType =
+            std::conditional_t<sizeof(ADataType_) < sizeof(BDataType_), ADataType_, BDataType_>;
+        // Calculate thresholds
+        const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType_, AccDataType_>(
+            ck_tile::integer_divide_ceil(K, kbatch));
+        const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType_, AccDataType_>(
+            max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+        // Calculate error due to split_k accumulation
+        const auto rtol_split_k =
+            ck_tile::get_relative_threshold<CDataType_, CDataType_, CDataType_>(kbatch);
+        const auto atol_split_k =
+            ck_tile::get_absolute_threshold<CDataType_, CDataType_, CDataType_>(
+                max_accumulated_value, kbatch);
+        // Use higher threshold
+        return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+    }
+
+    template <typename T>
+    auto shuffle_b(const ck_tile::HostTensor<T>& t)
+    {
+        assert(t.get_lengths().size() == 2);
+        int n_                = t.get_lengths()[1];
+        int k_                = t.get_lengths()[0];
+        constexpr int divisor = N_Warp_Tile == 32 ? 2 : 4;
+        ck_tile::HostTensor<T> t_view(
+            {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+};
+
+// Define generic QuantTypeTraits template (will be specialized)
+template <ck_tile::QuantType QT>
+struct QuantTypeTraits
+{
+    static_assert(QT == ck_tile::QuantType::AQuantGrouped ||
+                      QT == ck_tile::QuantType::BQuantGrouped ||
+                      QT == ck_tile::QuantType::RowColQuant ||
+                      QT == ck_tile::QuantType::TensorQuant,
+                  "Unsupported quantization type");
+};
+
+// Specialization for AQuantGrouped
+template <>
+struct QuantTypeTraits<ck_tile::QuantType::AQuantGrouped>
+{
+    template <typename ADataType, typename BDataType>
+    using ComputeDataType = BDataType; // For AQuant, compute type is BDataType
+
+    static constexpr const char* name = "aquant";
+};
+
+// Specialization for BQuantGrouped
+template <>
+struct QuantTypeTraits<ck_tile::QuantType::BQuantGrouped>
+{
+    template <typename ADataType, typename BDataType>
+    using ComputeDataType = ADataType; // For BQuant, compute type is ADataType
+
+    static constexpr const char* name = "bquant";
+};
+
+// Specialization for RowColQuant
+template <>
+struct QuantTypeTraits<ck_tile::QuantType::RowColQuant>
+{
+    template <typename ADataType, typename BDataType>
+    using ComputeDataType = ADataType; // For RowColQuant, compute type is ADataType
+
+    static constexpr const char* name = "rowcol";
+};
+
+// Specialization for TensorQuant
+template <>
+struct QuantTypeTraits<ck_tile::QuantType::TensorQuant>
+{
+    template <typename ADataType, typename BDataType>
+    using ComputeDataType = ADataType; // For TensorQuant, compute type is ADataType
+
+    static constexpr const char* name = "tensor";
+};
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
new file mode 100644
index 0000000000..98f88f4d53
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "test_gemm_quant_base.hpp"
+#include "ck_tile/host/permute_pk_int4.hpp"
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool PreshuffleQuant           = false;
+    static constexpr bool PreshuffleB               = false;
+    static constexpr bool DoubleSmemBuffer          = false;
+
+    // Default GEMM tile sizes for tests
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256;
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 32;
+};
+
+struct GemmConfigPreshuffleB
+{
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool PreshuffleQuant           = false;
+    static constexpr bool PreshuffleB               = true;
+    static constexpr bool DoubleSmemBuffer          = true;
+
+    // Default GEMM tile sizes for tests
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256;
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 64;
+};
+
+template <typename Tuple>
+class TestCkTileGemmAQuant : public TestCkTileGemmQuantBase<Tuple, TestCkTileGemmAQuant<Tuple>>
+{
+    using Base = TestCkTileGemmQuantBase<Tuple, TestCkTileGemmAQuant<Tuple>>;
+    friend Base;
+
+    public:
+    using typename Base::AccDataType;
+    using typename Base::ADataType;
+    using typename Base::ALayout;
+    using typename Base::BDataType;
+    using typename Base::BLayout;
+    using typename Base::CDataType;
+    using typename Base::CLayout;
+    using typename Base::ComputeDataType;
+    using typename Base::QDataType;
+
+    static constexpr auto QuantType          = Base::QuantType;
+    static constexpr uint32_t QuantGroupSize = Base::QuantGroupSize;
+
+    protected:
+    void SetUpQuantTypeSpecific() {}
+    void TearDownQuantTypeSpecific() {}
+
+    // AQuant-specific data generation
+    void run_test_with_validation(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K)
+    {
+        const ck_tile::index_t stride_A = K;
+        const ck_tile::index_t stride_B = K;
+        const ck_tile::index_t stride_C = M;
+
+        // AQuant uses grouped quantization for A matrix
+        const ck_tile::index_t AQK = ck_tile::integer_divide_ceil(K, QuantGroupSize);
+        const ck_tile::index_t stride_AQ =
+            ck_tile::get_default_stride(M, AQK, 0, this->is_row_major(ALayout{}));
+
+        // Generate test data
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, this->is_row_major(ALayout{})));
+        ck_tile::HostTensor<QDataType> aq_m_aqk(
+            ck_tile::host_tensor_descriptor(M, AQK, stride_AQ, this->is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, this->is_row_major(BLayout{})));
+
+        // Initialize data with random values
+        if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f}(a_m_k);
+        }
+        else
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f}(a_m_k);
+        }
+        ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f}(b_k_n);
+        ck_tile::FillUniformDistribution<QDataType>{-2.0f, 2.0f}(aq_m_aqk);
+
+        // Allocate device memory
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size() * sizeof(ADataType));
+        ck_tile::DeviceMem aq_m_aqk_dev_buf(aq_m_aqk.get_element_space_size() * sizeof(QDataType));
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size() * sizeof(BDataType));
+        ck_tile::DeviceMem c_m_n_dev_buf(M * N * sizeof(CDataType));
+
+        // Copy to device
+        if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<ADataType> temp = a_m_k;
+            ck_tile::permute_vectors_i4x4_b(temp);
+            a_m_k_dev_buf.ToDevice(temp.data());
+        }
+        else
+        {
+            a_m_k_dev_buf.ToDevice(a_m_k.data());
+        }
+        aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+
+        // Create args for kernel execution
+        ck_tile::QuantGemmHostArgs args{
+            a_m_k_dev_buf.GetDeviceBuffer(),    // a_ptr
+            b_k_n_dev_buf.GetDeviceBuffer(),    // b_ptr
+            c_m_n_dev_buf.GetDeviceBuffer(),    // c_ptr
+            aq_m_aqk_dev_buf.GetDeviceBuffer(), // aq_ptr (scales)
+            nullptr,                            // bq_ptr (not used for AQuant)
+            1,                                  // k_batch
+            M,
+            N,
+            K,   // M, N, K
+            AQK, // QK_A
+            0,   // QK_B (not used for AQuant)
+            stride_A,
+            stride_B,
+            stride_C,
+            stride_AQ,
+            0 // strides
+        };
+
+        // Run the kernel
+        ck_tile::stream_config stream_config{};
+        this->invoke_quant_gemm(args, stream_config);
+
+        // Validation using reference implementation
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        // Run reference AQuant implementation
+        ck_tile::reference_gemm_quant<ADataType,
+                                      QDataType,
+                                      BDataType,
+                                      AccDataType,
+                                      CDataType,
+                                      QuantGroupSize,
+                                      true>(a_m_k, aq_m_aqk, b_k_n, c_m_n_host_ref);
+
+        // Get device result
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.mData.data());
+
+        // Calculate error tolerances
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol =
+            this->template calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                K, 1, max_accumulated_value);
+
+        // Validate results
+        bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+
+        EXPECT_TRUE(pass) << "AQuantGrouped validation failed with M=" << M << ", N=" << N
+                          << ", K=" << K;
+
+        if(!pass)
+        {
+            std::cout << "AQuantGrouped - Relative error threshold: "
+                      << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+    }
+
+    private:
+    // AQuant-specific pipeline implementation
+    template <typename CodegenGemmShape, typename TilePartitioner, typename CodegenGemmTraits>
+    void run_quant_gemm_impl(const ck_tile::QuantGemmHostArgs& args,
+                             const ck_tile::stream_config& s)
+    {
+        using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     CodegenGemmTraits,
+                                                                     ComputeDataType>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t K_split  = (args.K + Base::K_Tile - 1) / Base::K_Tile * Base::K_Tile;
+        const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop         = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr bool transpose_c    = false;
+
+            using PipelineProblem =
+                ck_tile::GemmAQuantPipelineProblem<ADataType,
+                                                   QDataType,
+                                                   BDataType,
+                                                   AccDataType,
+                                                   CodegenGemmShape,
+                                                   CodegenGemmTraits,
+                                                   QuantGroupSize,
+                                                   transpose_c,
+                                                   ComputeDataType,
+                                                   ck_tile::GemmPipelineScheduler::Intrawave,
+                                                   has_hot_loop_v,
+                                                   tail_number_v>;
+
+            using GemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<PipelineProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 Base::M_Warp,
+                                                 Base::N_Warp,
+                                                 Base::M_Warp_Tile,
+                                                 Base::N_Warp_Tile,
+                                                 Base::K_Warp_Tile,
+                                                 transpose_c,
+                                                 ck_tile::memory_operation_enum::set>>;
+
+            using Kernel = ck_tile::QuantGemmKernel<TilePartitioner,
+                                                    GemmPipeline,
+                                                    GemmEpilogue,
+                                                    ck_tile::QuantType::AQuantGrouped>;
+
+            auto kargs        = Kernel::MakeKernelArgs(args);
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Arguments not supported for AQuant kernel");
+            }
+
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfigBase::kBlockPerCu>(
+                                       Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+    }
+};
+
+// BQuant-specific test fixture
+template <typename Tuple>
+class TestCkTileGemmBQuant : public TestCkTileGemmQuantBase<Tuple, TestCkTileGemmBQuant<Tuple>>
+{
+    using Base = TestCkTileGemmQuantBase<Tuple, TestCkTileGemmBQuant<Tuple>>;
+    friend Base;
+
+    public:
+    using typename Base::AccDataType;
+    using typename Base::ADataType;
+    using typename Base::ALayout;
+    using typename Base::BDataType;
+    using typename Base::BLayout;
+    using typename Base::CDataType;
+    using typename Base::CLayout;
+    using typename Base::ComputeDataType;
+    using typename Base::QDataType;
+
+    static constexpr auto QuantType          = Base::QuantType;
+    static constexpr uint32_t QuantGroupSize = Base::QuantGroupSize;
+    static constexpr auto PreshuffleB        = Base::PreshuffleB;
+
+    protected:
+    void SetUpQuantTypeSpecific() {}
+    void TearDownQuantTypeSpecific() {}
+
+    void run_test_with_validation(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K)
+    {
+        const ck_tile::index_t stride_A = K;
+        const ck_tile::index_t stride_B = K;
+        const ck_tile::index_t stride_C = M;
+
+        // BQuant uses grouped quantization for B matrix
+        const ck_tile::index_t BQK       = ck_tile::integer_divide_ceil(K, QuantGroupSize);
+        const ck_tile::index_t stride_BQ = BQK;
+
+        // Generate test data
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, this->is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, this->is_row_major(BLayout{})));
+        ck_tile::HostTensor<QDataType> bq_bqk_n(
+            ck_tile::host_tensor_descriptor(BQK, N, stride_BQ, this->is_row_major(BLayout{})));
+
+        // Initialize data with random values
+        ck_tile::FillUniformDistribution<ADataType>{-0.5f, 0.5f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{0.f, 1.f}(b_k_n);
+        ck_tile::FillUniformDistribution<QDataType>{0.001f, 0.01f}(bq_bqk_n);
+
+        // Allocate device memory
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size() * sizeof(ADataType));
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size() * sizeof(BDataType));
+        ck_tile::DeviceMem bq_bqk_n_dev_buf(bq_bqk_n.get_element_space_size() * sizeof(QDataType));
+        ck_tile::DeviceMem c_m_n_dev_buf(M * N * sizeof(CDataType));
+
+        // Copy to device
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            if constexpr(PreshuffleB)
+            {
+                b_k_n_dev = this->shuffle_b(b_k_n);
+            }
+            ck_tile::permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            if constexpr(PreshuffleB)
+            {
+                b_k_n_dev = this->shuffle_b(b_k_n);
+            }
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        bq_bqk_n_dev_buf.ToDevice(bq_bqk_n.data());
+
+        // Create args for kernel execution
+        ck_tile::QuantGemmHostArgs args{
+            a_m_k_dev_buf.GetDeviceBuffer(),    // a_ptr
+            b_k_n_dev_buf.GetDeviceBuffer(),    // b_ptr
+            c_m_n_dev_buf.GetDeviceBuffer(),    // c_ptr
+            nullptr,                            // aq_ptr (not used for BQuant)
+            bq_bqk_n_dev_buf.GetDeviceBuffer(), // bq_ptr (scales)
+            1,                                  // k_batch
+            M,
+            N,
+            K,   // M, N, K
+            0,   // QK_A (not used for BQuant)
+            BQK, // QK_B
+            stride_A,
+            stride_B,
+            stride_C,
+            0,
+            stride_BQ // strides
+        };
+
+        // Run the kernel
+        ck_tile::stream_config stream_config{};
+        this->invoke_quant_gemm(args, stream_config);
+
+        // Validation using reference implementation
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        // Run reference BQuant implementation
+        ck_tile::reference_gemm_quant<ADataType,
+                                      QDataType,
+                                      BDataType,
+                                      AccDataType,
+                                      CDataType,
+                                      QuantGroupSize,
+                                      false>(a_m_k, bq_bqk_n, b_k_n, c_m_n_host_ref);
+
+        // Get device result
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.mData.data());
+
+        // Calculate error tolerances
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol =
+            this->template calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                K, 1, max_accumulated_value);
+
+        // Validate results
+        bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+
+        EXPECT_TRUE(pass) << "BQuantGrouped validation failed with M=" << M << ", N=" << N
+                          << ", K=" << K;
+
+        if(!pass)
+        {
+            std::cout << "BQuantGrouped - Relative error threshold: "
+                      << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+    }
+
+    private:
+    // BQuant-specific pipeline implementation
+    template <typename CodegenGemmShape, typename TilePartitioner, typename CodegenGemmTraits>
+    void run_quant_gemm_impl(const ck_tile::QuantGemmHostArgs& args,
+                             const ck_tile::stream_config& s)
+    {
+        using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     CodegenGemmTraits,
+                                                                     ComputeDataType>;
+
+        using BaseGemmPipeline = std::conditional_t<
+            PreshuffleB == false,
+            ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>,
+            ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<GemmPipelineProblem>>;
+
+        const ck_tile::index_t K_split  = (args.K + Base::K_Tile - 1) / Base::K_Tile * Base::K_Tile;
+        const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop         = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+
+            using PipelineProblem =
+                ck_tile::GemmBQuantPipelineProblem<ADataType,
+                                                   BDataType,
+                                                   QDataType,
+                                                   AccDataType,
+                                                   CodegenGemmShape,
+                                                   CodegenGemmTraits,
+                                                   QuantGroupSize,
+                                                   ComputeDataType,
+                                                   ck_tile::GemmPipelineScheduler::Intrawave,
+                                                   has_hot_loop_v,
+                                                   tail_number_v>;
+
+            using GemmPipeline =
+                std::conditional_t<PreshuffleB == false,
+                                   ck_tile::BQuantGemmPipelineAgBgCrCompV3<PipelineProblem>,
+                                   ck_tile::WPQuantBPipelineAgBgCrV2<PipelineProblem>>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 Base::M_Warp,
+                                                 Base::N_Warp,
+                                                 Base::M_Warp_Tile,
+                                                 Base::N_Warp_Tile,
+                                                 Base::K_Warp_Tile,
+                                                 false, // transpose_c
+                                                 ck_tile::memory_operation_enum::set>>;
+
+            using Kernel = ck_tile::QuantGemmKernel<TilePartitioner,
+                                                    GemmPipeline,
+                                                    GemmEpilogue,
+                                                    ck_tile::QuantType::BQuantGrouped>;
+
+            auto kargs        = Kernel::MakeKernelArgs(args);
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Arguments not supported for BQuant kernel");
+            }
+
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfigBase::kBlockPerCu>(
+                                       Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+    }
+};
+
+template <typename Tuple>
+class TestCkTileGemmPreshuffleBBQuant : public TestCkTileGemmBQuant<Tuple>
+{
+};
+
+// RowColQuant-specific test fixture
+template <typename Tuple>
+class TestCkTileGemmRowColQuant
+    : public TestCkTileGemmQuantBase<Tuple, TestCkTileGemmRowColQuant<Tuple>>
+{
+    using Base = TestCkTileGemmQuantBase<Tuple, TestCkTileGemmRowColQuant<Tuple>>;
+    friend Base;
+
+    public:
+    using typename Base::AccDataType;
+    using typename Base::ADataType;
+    using typename Base::ALayout;
+    using typename Base::BDataType;
+    using typename Base::BLayout;
+    using typename Base::CDataType;
+    using typename Base::CLayout;
+    using typename Base::ComputeDataType;
+    using typename Base::QDataType;
+
+    static constexpr auto QuantType          = Base::QuantType;
+    static constexpr uint32_t QuantGroupSize = Base::QuantGroupSize;
+
+    protected:
+    void SetUpQuantTypeSpecific() {}
+    void TearDownQuantTypeSpecific() {}
+
+    void run_test_with_validation(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K)
+    {
+        const ck_tile::index_t stride_A = K;
+        const ck_tile::index_t stride_B = K;
+        const ck_tile::index_t stride_C = M;
+
+        // RowColQuant uses per-row and per-column scales
+        const ck_tile::index_t stride_row_scales = 1;
+        const ck_tile::index_t stride_col_scales = 1;
+
+        // Generate test data
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, this->is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, this->is_row_major(BLayout{})));
+        ck_tile::HostTensor<QDataType> row_scales_m(ck_tile::host_tensor_descriptor(
+            M, 1, stride_row_scales, ck_tile::bool_constant<true>{}));
+        ck_tile::HostTensor<QDataType> col_scales_n(ck_tile::host_tensor_descriptor(
+            N, 1, stride_col_scales, ck_tile::bool_constant<true>{}));
+
+        // Initialize data with random values
+        ck_tile::FillUniformDistribution<ADataType>{-0.5f, 0.5f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-0.5f, 0.5f}(b_k_n);
+        ck_tile::FillUniformDistribution<QDataType>{0.001f, 0.01f}(row_scales_m);
+        ck_tile::FillUniformDistribution<QDataType>{0.001f, 0.01f}(col_scales_n);
+
+        // Allocate device memory
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size() * sizeof(ADataType));
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size() * sizeof(BDataType));
+        ck_tile::DeviceMem row_scales_dev_buf(row_scales_m.get_element_space_size() *
+                                              sizeof(QDataType));
+        ck_tile::DeviceMem col_scales_dev_buf(col_scales_n.get_element_space_size() *
+                                              sizeof(QDataType));
+        ck_tile::DeviceMem c_m_n_dev_buf(M * N * sizeof(CDataType));
+
+        // Copy to device
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+        row_scales_dev_buf.ToDevice(row_scales_m.data());
+        col_scales_dev_buf.ToDevice(col_scales_n.data());
+
+        // Create args for kernel execution
+        ck_tile::QuantGemmHostArgs args{
+            a_m_k_dev_buf.GetDeviceBuffer(),      // a_ptr
+            b_k_n_dev_buf.GetDeviceBuffer(),      // b_ptr
+            c_m_n_dev_buf.GetDeviceBuffer(),      // c_ptr
+            row_scales_dev_buf.GetDeviceBuffer(), // aq_ptr (row scales)
+            col_scales_dev_buf.GetDeviceBuffer(), // bq_ptr (col scales)
+            1,                                    // k_batch
+            M,
+            N,
+            K, // M, N, K
+            1, // QK_A (row scales)
+            1, // QK_B (col scales)
+            stride_A,
+            stride_B,
+            stride_C,
+            stride_row_scales,
+            stride_col_scales // strides
+        };
+
+        // Run the kernel
+        ck_tile::stream_config stream_config{};
+        this->invoke_quant_gemm(args, stream_config);
+
+        // Validation using reference implementation
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        // Run reference RowColQuant implementation
+        ck_tile::reference_gemm_rowcol_quant<ADataType,
+                                             QDataType,
+                                             BDataType,
+                                             QDataType,
+                                             AccDataType,
+                                             CDataType>(
+            a_m_k, row_scales_m, b_k_n, col_scales_n, c_m_n_host_ref);
+
+        // Get device result
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.mData.data());
+
+        // Calculate error tolerances
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol =
+            this->template calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                K, 1, max_accumulated_value);
+
+        // Validate results
+        bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+
+        EXPECT_TRUE(pass) << "RowColQuant validation failed with M=" << M << ", N=" << N
+                          << ", K=" << K;
+
+        if(!pass)
+        {
+            std::cout << "RowColQuant - Relative error threshold: "
+                      << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+    }
+
+    private:
+    // RowColQuant-specific pipeline implementation
+    template <typename CodegenGemmShape, typename TilePartitioner, typename CodegenGemmTraits>
+    void run_quant_gemm_impl(const ck_tile::QuantGemmHostArgs& args,
+                             const ck_tile::stream_config& s)
+    {
+        using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     CodegenGemmTraits,
+                                                                     ComputeDataType>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t K_split  = (args.K + Base::K_Tile - 1) / Base::K_Tile * Base::K_Tile;
+        const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop         = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr bool transpose_c    = false;
+
+            using PipelineProblem = ck_tile::GemmRowColTensorQuantPipelineProblem<
+                ADataType,
+                BDataType,
+                AccDataType,
+                AccDataType,
+                CodegenGemmShape,
+                CodegenGemmTraits,
+                transpose_c,
+                ComputeDataType,
+                ck_tile::GemmPipelineScheduler::Intrawave,
+                has_hot_loop_v,
+                tail_number_v>;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 Base::M_Warp,
+                                                 Base::N_Warp,
+                                                 Base::M_Warp_Tile,
+                                                 Base::N_Warp_Tile,
+                                                 Base::K_Warp_Tile,
+                                                 transpose_c,
+                                                 ck_tile::memory_operation_enum::set>>;
+
+            using Kernel = ck_tile::QuantGemmKernel<TilePartitioner,
+                                                    GemmPipeline,
+                                                    GemmEpilogue,
+                                                    ck_tile::QuantType::RowColQuant>;
+
+            auto kargs        = Kernel::MakeKernelArgs(args);
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Arguments not supported for RowColQuant kernel");
+            }
+
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfigBase::kBlockPerCu>(
+                                       Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+    }
+};
+
+// TensorQuant-specific test fixture
+template <typename Tuple>
+class TestCkTileGemmTensorQuant
+    : public TestCkTileGemmQuantBase<Tuple, TestCkTileGemmTensorQuant<Tuple>>
+{
+    using Base = TestCkTileGemmQuantBase<Tuple, TestCkTileGemmTensorQuant<Tuple>>;
+    friend Base;
+
+    public:
+    using typename Base::AccDataType;
+    using typename Base::ADataType;
+    using typename Base::ALayout;
+    using typename Base::BDataType;
+    using typename Base::BLayout;
+    using typename Base::CDataType;
+    using typename Base::CLayout;
+    using typename Base::ComputeDataType;
+    using typename Base::QDataType;
+
+    static constexpr auto QuantType          = Base::QuantType;
+    static constexpr uint32_t QuantGroupSize = Base::QuantGroupSize;
+
+    protected:
+    void SetUpQuantTypeSpecific() {}
+    void TearDownQuantTypeSpecific() {}
+
+    void run_test_with_validation(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K)
+    {
+        const ck_tile::index_t stride_A = K;
+        const ck_tile::index_t stride_B = K;
+        const ck_tile::index_t stride_C = M;
+
+        // TensorQuant uses single scalar scale for each tensor
+        const ck_tile::index_t stride_scale_a = 1;
+        const ck_tile::index_t stride_scale_b = 1;
+
+        // Generate test data
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, this->is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, this->is_row_major(BLayout{})));
+        ck_tile::HostTensor<QDataType> scale_a(
+            ck_tile::host_tensor_descriptor(1, 1, stride_scale_a, ck_tile::bool_constant<true>{}));
+        ck_tile::HostTensor<QDataType> scale_b(
+            ck_tile::host_tensor_descriptor(1, 1, stride_scale_b, ck_tile::bool_constant<true>{}));
+
+        // Initialize data with random values
+        ck_tile::FillUniformDistribution<ADataType>{-0.5f, 0.5f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-0.5f, 0.5f}(b_k_n);
+        ck_tile::FillUniformDistribution<QDataType>{0.001f, 0.01f}(scale_a);
+        ck_tile::FillUniformDistribution<QDataType>{0.001f, 0.01f}(scale_b);
+
+        // Allocate device memory
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size() * sizeof(ADataType));
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size() * sizeof(BDataType));
+        ck_tile::DeviceMem scale_a_dev_buf(scale_a.get_element_space_size() * sizeof(QDataType));
+        ck_tile::DeviceMem scale_b_dev_buf(scale_b.get_element_space_size() * sizeof(QDataType));
+        ck_tile::DeviceMem c_m_n_dev_buf(M * N * sizeof(CDataType));
+
+        // Copy to device
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+        scale_a_dev_buf.ToDevice(scale_a.data());
+        scale_b_dev_buf.ToDevice(scale_b.data());
+
+        // Create args for kernel execution
+        ck_tile::QuantGemmHostArgs args{
+            a_m_k_dev_buf.GetDeviceBuffer(),   // a_ptr
+            b_k_n_dev_buf.GetDeviceBuffer(),   // b_ptr
+            c_m_n_dev_buf.GetDeviceBuffer(),   // c_ptr
+            scale_a_dev_buf.GetDeviceBuffer(), // aq_ptr (scale A)
+            scale_b_dev_buf.GetDeviceBuffer(), // bq_ptr (scale B)
+            1,                                 // k_batch
+            M,
+            N,
+            K, // M, N, K
+            1, // QK_A (tensor scale)
+            1, // QK_B (tensor scale)
+            stride_A,
+            stride_B,
+            stride_C,
+            stride_scale_a,
+            stride_scale_b // strides
+        };
+
+        // Run the kernel
+        ck_tile::stream_config stream_config{};
+        this->invoke_quant_gemm(args, stream_config);
+
+        // Validation using reference implementation
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        // Run reference TensorQuant implementation
+        ck_tile::reference_gemm_tensor_quant<ADataType,
+                                             QDataType,
+                                             BDataType,
+                                             QDataType,
+                                             AccDataType,
+                                             CDataType>(
+            a_m_k, scale_a, b_k_n, scale_b, c_m_n_host_ref);
+
+        // Get device result
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, this->is_row_major(CLayout{})));
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.mData.data());
+
+        // Calculate error tolerances
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol =
+            this->template calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                K, 1, max_accumulated_value);
+
+        // Validate results
+        bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+
+        EXPECT_TRUE(pass) << "TensorQuant validation failed with M=" << M << ", N=" << N
+                          << ", K=" << K;
+
+        if(!pass)
+        {
+            std::cout << "TensorQuant - Relative error threshold: "
+                      << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+    }
+
+    private:
+    // TensorQuant-specific pipeline implementation
+    template <typename CodegenGemmShape, typename TilePartitioner, typename CodegenGemmTraits>
+    void run_quant_gemm_impl(const ck_tile::QuantGemmHostArgs& args,
+                             const ck_tile::stream_config& s)
+    {
+        using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     CodegenGemmTraits,
+                                                                     ComputeDataType>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t K_split  = (args.K + Base::K_Tile - 1) / Base::K_Tile * Base::K_Tile;
+        const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop         = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr bool transpose_c    = false;
+
+            using PipelineProblem = ck_tile::GemmRowColTensorQuantPipelineProblem<
+                ADataType,
+                BDataType,
+                AccDataType,
+                AccDataType,
+                CodegenGemmShape,
+                CodegenGemmTraits,
+                transpose_c,
+                ComputeDataType,
+                ck_tile::GemmPipelineScheduler::Intrawave,
+                has_hot_loop_v,
+                tail_number_v>;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 Base::M_Warp,
+                                                 Base::N_Warp,
+                                                 Base::M_Warp_Tile,
+                                                 Base::N_Warp_Tile,
+                                                 Base::K_Warp_Tile,
+                                                 transpose_c,
+                                                 ck_tile::memory_operation_enum::set>>;
+
+            using Kernel = ck_tile::QuantGemmKernel<TilePartitioner,
+                                                    GemmPipeline,
+                                                    GemmEpilogue,
+                                                    ck_tile::QuantType::TensorQuant>;
+
+            auto kargs        = Kernel::MakeKernelArgs(args);
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Arguments not supported for TensorQuant kernel");
+            }
+
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<GemmConfigBase::kBlockPerCu>(
+                                       Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+    }
+};
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_typed.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_typed.cpp
new file mode 100644
index 0000000000..e131c03189
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_typed.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "test_gemm_quant_fixtures.hpp"
+
+// Type aliases for readability
+using RowMajor      = ck_tile::tensor_layout::gemm::RowMajor;
+using ColumnMajor   = ck_tile::tensor_layout::gemm::ColumnMajor;
+using FP8           = ck_tile::fp8_t;
+using BF8           = ck_tile::bf8_t;
+using Half          = ck_tile::half_t;
+using PkInt4        = ck_tile::pk_int4_t;
+using AQuantGrouped = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::AQuantGrouped>;
+using BQuantGrouped = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::BQuantGrouped>;
+using RowColQuant   = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::RowColQuant>;
+using TensorQuant   = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::TensorQuant>;
+using GroupSize     = std::integral_constant<unsigned int, 128>;
+
+// Type combinations for each quantization type
+// clang-format off
+using AQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, FP8, float, Half, AQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, BF8, float, Half, AQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, PkInt4, FP8, FP8, Half, AQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, PkInt4, BF8, BF8, Half, AQuantGrouped, GemmConfigBase, GroupSize>
+>;
+// clang-format on
+
+// clang-format off
+using BQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, FP8, float, Half, BQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, BF8, float, Half, BQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, PkInt4, FP8, Half, BQuantGrouped, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, PkInt4, BF8, Half, BQuantGrouped, GemmConfigBase, GroupSize>
+>;
+// clang-format on
+
+// clang-format off
+using BPreshuffleBQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, FP8, float, Half, BQuantGrouped, GemmConfigPreshuffleB, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, BF8, float, Half, BQuantGrouped, GemmConfigPreshuffleB, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, PkInt4, FP8, Half, BQuantGrouped, GemmConfigPreshuffleB, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, PkInt4, BF8, Half, BQuantGrouped, GemmConfigPreshuffleB, GroupSize>
+>;
+
+// clang-format off
+using RowColQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, FP8, float, Half, RowColQuant, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, BF8, float, Half, RowColQuant, GemmConfigBase, GroupSize>
+>;
+// clang-format on
+
+// clang-format off
+using TensorQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, FP8, FP8, float, Half, TensorQuant, GemmConfigBase, GroupSize>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, BF8, BF8, float, Half, TensorQuant, GemmConfigBase, GroupSize>
+>;
+// clang-format on
+
+// Test suites for each quantization type
+TYPED_TEST_SUITE(TestCkTileGemmAQuant, AQuantTypes);
+TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantTypes);
+TYPED_TEST_SUITE(TestCkTileGemmPreshuffleBBQuant, BPreshuffleBQuantTypes);
+TYPED_TEST_SUITE(TestCkTileGemmRowColQuant, RowColQuantTypes);
+TYPED_TEST_SUITE(TestCkTileGemmTensorQuant, TensorQuantTypes);
+
+#include "test_gemm_quant_ut_cases.inc"
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_ut_cases.inc b/test/ck_tile/gemm_block_scale/test_gemm_quant_ut_cases.inc
new file mode 100644
index 0000000000..042735eccb
--- /dev/null
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_ut_cases.inc
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// AQuant tests
+TYPED_TEST(TestCkTileGemmAQuant, AQuantGroupedTest)
+{
+    this->run_test_with_validation(1024, 1024, 1024);
+}
+
+// BQuant tests
+TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedTest)
+{
+    this->run_test_with_validation(1024, 1024, 1024);
+}
+
+// BQuant tests
+TYPED_TEST(TestCkTileGemmPreshuffleBBQuant, BQuantGroupedTest)
+{
+    this->run_test_with_validation(1024, 1024, 1024);
+}
+// RowColQuant tests
+TYPED_TEST(TestCkTileGemmRowColQuant, RowColQuantTest)
+{
+    this->run_test_with_validation(1024, 1024, 1024);
+}
+
+// TensorQuant tests
+TYPED_TEST(TestCkTileGemmTensorQuant, TensorQuantTest)
+{
+    this->run_test_with_validation(1024, 1024, 1024);
+}
diff --git a/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc b/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
deleted file mode 100644
index 9ed42ff8d2..0000000000
--- a/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
+++ /dev/null
@@ -1,602 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <stdexcept>
-#include <string>
-#include <tuple>
-#include <random>
-
-#include "ck_tile/core/config.hpp"
-#include "ck_tile/host.hpp"
-#include "test_gemm_aquant_utils.hpp"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename AQDataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ComputeDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
-
-    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
-
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp;
-    constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp;
-    constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits = ck_tile::TileGemmAQuantTraits<kPadM,
-                                                            kPadN,
-                                                            kPadK,
-                                                            GemmConfig::PreshuffleQuant,
-                                                            ALayout,
-                                                            BLayout,
-                                                            CLayout>;
-
-    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
-                                                                 BDataType,
-                                                                 AccDataType,
-                                                                 CodegenGemmShape,
-                                                                 CodegenGemmTraits,
-                                                                 ComputeDataType>;
-
-    using BaseGemmPipeline = ck_tile::BaseAQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
-
-    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
-    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-    constexpr bool transposed_warp_gemm = false;
-
-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-
-        using CodegenPipelineProblem =
-            ck_tile::GemmAQuantPipelineProblem<ADataType,
-                                               AQDataType,
-                                               BDataType,
-                                               AccDataType,
-                                               CodegenGemmShape,
-                                               CodegenGemmTraits,
-                                               QuantGroupSize,
-                                               transposed_warp_gemm,
-                                               ComputeDataType,
-                                               ck_tile::GemmPipelineScheduler::Intrawave,
-                                               has_hot_loop_v,
-                                               tail_number_v>;
-        using CodegenGemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
-        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-                   ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                    BDataType,
-                                                    ck_tile::tuple<>,
-                                                    AccDataType,
-                                                    CDataType,
-                                                    ck_tile::tuple<>,
-                                                    CLayout,
-                                                    ck_tile::element_wise::PassThrough,
-                                                    TilePartitioner::MPerBlock,
-                                                    TilePartitioner::NPerBlock,
-                                                    M_Warp,
-                                                    N_Warp,
-                                                    M_Warp_Tile,
-                                                    N_Warp_Tile,
-                                                    K_Warp_Tile,
-                                                    transposed_warp_gemm,
-                                                    ck_tile::memory_operation_enum::set>>;
-        using Kernel =
-            ck_tile::AQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-
-        auto kargs = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(args.k_batch != 1)
-        {
-            throw std::runtime_error("split-k is not supported yet!");
-        }
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
-}
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename AQDataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename AQLayout,
-          typename BLayout,
-          typename CLayout,
-          uint32_t QuantGroupSize>
-float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                  ck_tile::DeviceMem& aq_m_aqk_dev_buf,
-                  ck_tile::DeviceMem& b_k_n_dev_buf,
-                  ck_tile::DeviceMem& c_m_n_dev_buf,
-                  ck_tile::index_t M,
-                  ck_tile::index_t N,
-                  ck_tile::index_t K,
-                  ck_tile::index_t AQK,
-                  ck_tile::index_t stride_A,
-                  ck_tile::index_t stride_AQ,
-                  ck_tile::index_t stride_B,
-                  ck_tile::index_t stride_C,
-                  ck_tile::index_t kbatch,
-                  int n_warmup,
-                  int n_repeat)
-{
-    ck_tile::AQuantGemmHostArgs args;
-    args.a_ptr     = a_m_k_dev_buf.GetDeviceBuffer();
-    args.aq_ptr    = aq_m_aqk_dev_buf.GetDeviceBuffer();
-    args.b_ptr     = b_k_n_dev_buf.GetDeviceBuffer();
-    args.c_ptr     = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch   = kbatch;
-    args.M         = M;
-    args.N         = N;
-    args.K         = K;
-    args.QK        = AQK;
-    args.stride_A  = stride_A;
-    args.stride_B  = stride_B;
-    args.stride_C  = stride_C;
-    args.stride_AQ = stride_AQ;
-
-    float ave_time = gemm_calc_aquant<GemmConfig,
-                                      ADataType,
-                                      AQDataType,
-                                      BDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      BDataType,
-                                      ALayout,
-                                      BLayout,
-                                      CLayout,
-                                      QuantGroupSize>(
-        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
-
-    std::size_t flop     = std::size_t(2) * M * N * K;
-    std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(AQDataType) * M * AQK +
-                           sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideAQ =" << stride_AQ << " StrideB =" << stride_B
-              << " StrideC =" << stride_C << " A_Layout =" << ALayout::name
-              << " B_Layout =" << BLayout::name << " C_Layout =" << CLayout::name
-              << " A_Type = " << DataTypeTraits<ADataType>::name
-              << " AQ_Type = " << DataTypeTraits<AQDataType>::name
-              << " B_Type = " << DataTypeTraits<BDataType>::name
-              << " Acc_Type = " << DataTypeTraits<AccDataType>::name
-              << " C_Type = " << DataTypeTraits<CDataType>::name << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename TypeConfig,
-          uint32_t QuantGroupSize,
-          typename ALayout,
-          typename AQLayout,
-          typename BLayout,
-          typename CLayout>
-bool run_gemm_test_with_layouts(int argc,
-                                char* argv[],
-                                const ALayout a_layout                  = ALayout{},
-                                const AQLayout aq_layout                = AQLayout{},
-                                const BLayout b_layout                  = BLayout{},
-                                [[maybe_unused]] const CLayout c_layout = CLayout{})
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return false;
-
-    using ADataType   = typename TypeConfig::ADataType;
-    using AQDataType  = typename TypeConfig::QDataType;
-    using BDataType   = typename TypeConfig::BDataType;
-    using AccDataType = typename TypeConfig::AccDataType;
-    using CDataType   = typename TypeConfig::CDataType;
-
-    ck_tile::index_t M = arg_parser.get_int("m");
-    ck_tile::index_t N = arg_parser.get_int("n");
-    ck_tile::index_t K = arg_parser.get_int("k");
-
-    if(K % QuantGroupSize != 0)
-    {
-        throw std::runtime_error("K must be aligned with QuantGroupSize");
-    }
-
-    ck_tile::index_t AQK = K / QuantGroupSize;
-
-    ck_tile::index_t stride_A  = arg_parser.get_int("stride_a");
-    ck_tile::index_t stride_AQ = arg_parser.get_int("stride_q");
-    ck_tile::index_t stride_B  = arg_parser.get_int("stride_b");
-    ck_tile::index_t stride_C  = arg_parser.get_int("stride_c");
-
-    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
-    int n_warmup                 = arg_parser.get_int("warmup");
-    int n_repeat                 = arg_parser.get_int("repeat");
-    ck_tile::index_t init_method = arg_parser.get_int("init");
-
-    stride_A  = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
-    stride_AQ = ck_tile::get_default_stride(M, AQK, stride_AQ, is_row_major(aq_layout));
-    stride_B  = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
-    stride_C  = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
-
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
-    ck_tile::HostTensor<AQDataType> aq_m_aqk(
-        ck_tile::host_tensor_descriptor(M, AQK, stride_AQ, is_row_major(aq_layout)));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<std::uint32_t> fill_seed(0, 500);
-
-    if(init_method == 0)
-    {
-        if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
-        {
-            ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
-                a_m_k);
-        }
-        else
-        {
-            ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
-        }
-        ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(aq_m_aqk);
-        ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
-    }
-    else if(init_method == 1)
-    {
-        std::cout << "Monotonic initialization is not supported." << std::endl;
-        return true;
-    }
-    else if(init_method == 2)
-    {
-        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x22)}(a_m_k);
-        ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(0.5f)}(aq_m_aqk);
-        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x38)}(b_k_n);
-    }
-    else
-    {
-        a_m_k.SetZero();
-        aq_m_aqk.SetZero();
-        b_k_n.SetZero();
-    }
-
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem aq_m_aqk_dev_buf(aq_m_aqk.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
-
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
-    b_k_n_dev_buf.ToDevice(b_k_n.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
-
-    invoke_gemm<GemmConfig,
-                ADataType,
-                AQDataType,
-                BDataType,
-                AccDataType,
-                CDataType,
-                ALayout,
-                AQLayout,
-                BLayout,
-                CLayout,
-                QuantGroupSize>(a_m_k_dev_buf,
-                                aq_m_aqk_dev_buf,
-                                b_k_n_dev_buf,
-                                c_m_n_dev_buf,
-                                M,
-                                N,
-                                K,
-                                AQK,
-                                stride_A,
-                                stride_AQ,
-                                stride_B,
-                                stride_C,
-                                kbatch,
-                                n_warmup,
-                                n_repeat);
-
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-    bool pass = true;
-
-    if(arg_parser.get_int("v") == 1)
-    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
-        ck_tile::reference_gemm_quant<ADataType,
-                                      AQDataType,
-                                      BDataType,
-                                      AccDataType,
-                                      CDataType,
-                                      QuantGroupSize,
-                                      true>(a_m_k, aq_m_aqk, b_k_n, c_m_n_host_ref);
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        if(!pass)
-        {
-            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                      << std::endl;
-        }
-        std::cout << "CPU verification " << (pass ? "Passed!" : "Failed ...") << std::endl;
-    }
-    else if(arg_parser.get_int("v") == 2)
-    {
-        std::cout << "GPU verification is not implemented yet. Re-run with -v=1" << std::endl;
-        return false;
-    }
-
-    return pass;
-}
-
-template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
-bool run_gemm_test_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
-    {
-        if(a_layout == "R" && b_layout == "C")
-        {
-            return run_gemm_test_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
-                argc, argv, Row{}, Row{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported memory layout for the input matrices!");
-        }
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for A.");
-    }
-
-    return true;
-}
-
-template <template <typename PreType> typename GemmConfig>
-bool run_gemm_test(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return false;
-
-    std::string data_type = arg_parser.get_str("prec");
-    std::string a_layout  = arg_parser.get_str("a_layout");
-    std::string b_layout  = arg_parser.get_str("b_layout");
-
-    if(data_type == "fp8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "bf8")
-    {
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4fp8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::fp8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::fp8_t>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4bf8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::bf8_t,
-                                                        ck_tile::half_t,
-                                                        ck_tile::bf8_t>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4f32fp8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::fp8_t,
-                                                        ck_tile::half_t,
-                                                        float>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else if(data_type == "i4f32bf8")
-    {
-        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
-                                                        ck_tile::bf8_t,
-                                                        ck_tile::half_t,
-                                                        float>{});
-        return run_gemm_test_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
-            a_layout, b_layout, argc, argv);
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data type for this operation !!!");
-    }
-}
-
-int run_gemm_combinations(std::string const& data_type)
-{
-    // Define possible values for each parameter
-    std::vector<std::vector<std::string>> mnk_values = {{
-                                                            "1",
-                                                            "2048",
-                                                            "5120",
-                                                        },
-                                                        {
-                                                            "2",
-                                                            "2048",
-                                                            "5120",
-                                                        },
-                                                        {
-                                                            "16",
-                                                            "2048",
-                                                            "5120",
-                                                        },
-                                                        {
-                                                            "17",
-                                                            "2048",
-                                                            "5120",
-                                                        },
-                                                        {
-                                                            "2047",
-                                                            "5120",
-                                                            "1024",
-                                                        },
-                                                        {
-                                                            "2048",
-                                                            "5120",
-                                                            "1024",
-                                                        }};
-    std::vector<std::string> prec_values             = {data_type};
-
-    // We'll store all our arguments as strings first
-    std::vector<std::string> arg_strings = {"test_tile_gemm_aquant_basic",
-                                            "", // m placeholder
-                                            "", // n placeholder
-                                            "", // k placeholder
-                                            "", // prec placeholder
-                                            "-init=0",
-                                            "-v=1",
-                                            "-warmup=0",
-                                            "-repeat=1"};
-
-    // Create an array of const char pointers for argv
-    constexpr size_t ARG_COUNT   = 9;
-    constexpr size_t ARG_MAX_LEN = 64;
-    char args[ARG_COUNT][ARG_MAX_LEN];
-    char* argv[ARG_COUNT];
-
-    // Run all combinations
-    bool is_success = true;
-    for(const auto& mnk : mnk_values)
-    {
-        arg_strings[1] = "-m=" + mnk[0];
-        arg_strings[2] = "-n=" + mnk[1];
-        arg_strings[3] = "-k=" + mnk[2];
-
-        for(const auto& prec : prec_values)
-        {
-            arg_strings[4] = "-prec=" + prec;
-
-            // Set up the argv array with pointers to the string data
-            for(size_t i = 0; i < ARG_COUNT; i++)
-            {
-                strncpy(args[i], arg_strings[i].c_str(), ARG_MAX_LEN);
-                argv[i] = args[i];
-            }
-
-            std::cout << "Arguments received: ";
-            for(size_t i = 1; i < ARG_COUNT; ++i)
-            {
-                std::cout << argv[i] << " ";
-            }
-            std::cout << std::endl;
-
-            // Call the function with the current configuration
-            try
-            {
-                is_success = run_gemm_test<GemmConfigDecode>(ARG_COUNT, argv) && is_success;
-            }
-            catch(const ArgumentsNotSupportedException& e)
-            {
-                std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
-                // ArgumentsNotSupportedException  is not an error. Do not change is_success
-            }
-            catch(const std::runtime_error& e)
-            {
-                std::cerr << "Caught runtime error: " << e.what() << '\n';
-                is_success = false;
-            }
-        }
-    }
-    return is_success ? EXIT_SUCCESS : EXIT_FAILURE;
-}
diff --git a/test/ck_tile/gemm_multi_abd/CMakeLists.txt b/test/ck_tile/gemm_multi_abd/CMakeLists.txt
new file mode 100644
index 0000000000..8f9b694a3b
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Currently ck_tile is only built on gfx9
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+
+if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+    add_gtest_executable(test_ck_tile_gemm_multi_abd_cshuffle test_gemm_multi_abd_cshuffle.cpp)
+    add_gtest_executable(test_ck_tile_gemm_multi_abd_default2d test_gemm_multi_abd_default2d.cpp)
+    target_compile_definitions(test_ck_tile_gemm_multi_abd_cshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_definitions(test_ck_tile_gemm_multi_abd_default2d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+endif()
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp
new file mode 100644
index 0000000000..87d6a9101c
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_gemm_multi_abd_util.hpp"
+
+using F16  = ck_tile::half_t;
+using BF16 = ck_tile::bf16_t;
+using F32  = float;
+using F8   = ck_tile::fp8_t;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    // Has cshuffle epilogue enabled
+    //          A0Layout, A1Layout, B0Layout, B1Layout CLayout, D0Layout, D1Layout, A0DataType, A01DataType B0DataType, B0DataType, D0DataType,  D1DataType, AccDataType, EDataType, AElementWiseFn, BElementWiseFn, CDElementWiseFn, UseCshuffleEpilog
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          BF16,       BF16,       F32,      F16,          AddScale,       AddScale,    ElementWiseAddAdd, std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    ElementWiseAddAdd, std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    ElementWiseAddAdd, std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd, std::true_type>,
+
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F16,        F16,        F32,      F16,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>
+
+    // Currently MultiABD kernel doesn't support F8 data type
+    //std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd, std::true_type>,
+    //std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGemmMultiABD, KernelTypes);
+
+#include "test_gemm_multi_abd_ut_cases_cshuffle.inc"
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp
new file mode 100644
index 0000000000..f2476e803f
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_gemm_multi_abd_util.hpp"
+
+using F16  = ck_tile::half_t;
+using BF16 = ck_tile::bf16_t;
+using F32  = float;
+using F8   = ck_tile::fp8_t;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    // Has cshuffle epilogue disabled
+    //          A0Layout, A1Layout, B0Layout, B1Layout CLayout, D0Layout, D1Layout, A0DataType, A01DataType B0DataType, B0DataType, D0DataType,  D1DataType, AccDataType, EDataType, AElementWiseFn, BElementWiseFn, CDElementWiseFn, UseCshuffleEpilog
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F16,         AddScale,       AddScale,    ElementWiseAddAdd, std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F16,         AddScale,       AddScale,    ElementWiseAddAdd, std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F32,         AddScale,       AddScale,    ElementWiseAddAdd, std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    ElementWiseAddAdd, std::false_type>,
+
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F16,        F16,        F32,      F16,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F16,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F32,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,  std::false_type>
+
+    // Currently MultiABD kernel doesn't support F8 data type
+    //std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    ElementWiseAddAdd, std::false_type>,
+    //std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGemmMultiABD, KernelTypes);
+
+#include "test_gemm_multi_abd_ut_cases_default2d.inc"
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
new file mode 100644
index 0000000000..33eb404fbe
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
@@ -0,0 +1,111 @@
+#pragma once
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x512x512)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_256x512x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x256x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x512x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_256x256x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x768x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 768;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x1280x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 1280;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_256x1280x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 1280;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_768x512x256)
+{
+    constexpr int M      = 768;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_1280x512x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_1280x256x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_default2d.inc b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_default2d.inc
new file mode 100644
index 0000000000..cc7603164c
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_default2d.inc
@@ -0,0 +1,211 @@
+#pragma once
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_256x512x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 512;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_512x256x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 256;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_512x512x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 512;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_256x256x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 256;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_512x768x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 768;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_512x1280x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 1280;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_256x1280x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 1280;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_768x512x256)
+{
+    constexpr int M      = 768;
+    constexpr int N      = 512;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_1280x512x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 512;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch1Default_1280x256x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 256;
+    constexpr int K      = 256;
+    constexpr int kBatch = 1;
+
+    EXPECT_EQ(this->Run(M, N, K, kBatch), true);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_512x512x512)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_256x512x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_512x256x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_512x512x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_256x256x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_512x768x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 768;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_512x1280x256)
+{
+    constexpr int M      = 512;
+    constexpr int N      = 1280;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_256x1280x256)
+{
+    constexpr int M      = 256;
+    constexpr int N      = 1280;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_768x512x256)
+{
+    constexpr int M      = 768;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_1280x512x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 512;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
+
+TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2Default_1280x256x256)
+{
+    constexpr int M      = 1280;
+    constexpr int N      = 256;
+    constexpr int K      = 512;
+    constexpr int kBatch = 2;
+
+    EXPECT_THROW(this->Run(M, N, K, kBatch), std::runtime_error);
+}
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp
new file mode 100644
index 0000000000..428bed4e25
--- /dev/null
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp
@@ -0,0 +1,500 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <sstream>
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+struct AddScale
+{
+    template <typename E, typename A0, typename A1>
+    CK_TILE_HOST_DEVICE constexpr void operator()(E& a, const A0& a0, const A1& a1) const
+    {
+        a = scale * (ck_tile::type_convert<float>(a0) + ck_tile::type_convert<float>(a1));
+    }
+
+    float scale = 1.0;
+};
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const D0& d0, const D1& d1) const -> void
+    {
+        const float x0_f = ck_tile::type_convert<float>(c) * ck_tile::type_convert<float>(d0) *
+                           ck_tile::type_convert<float>(d1);
+
+        e = ck_tile::type_convert<E>(x0_f);
+    }
+};
+
+struct ElementWiseAddAdd
+{
+    template <typename E, typename C, typename D0, typename D1>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const D0& d0, const D1& d1) const -> void
+    {
+        const float x0_f = ck_tile::type_convert<float>(c) + ck_tile::type_convert<float>(d0) +
+                           ck_tile::type_convert<float>(d1);
+
+        e = ck_tile::type_convert<E>(x0_f);
+    }
+};
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+template <typename A0DataType,
+          typename B0DataType,
+          typename AccDataType,
+          typename EDataType,
+          typename D0DataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeTypeAB =
+        std::conditional_t<sizeof(A0DataType) < sizeof(B0DataType), A0DataType, B0DataType>;
+
+    using ComputeType =
+        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+        max_accumulated_value, kbatch);
+
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename Tuple>
+class TestCkTileGemmMultiABD : public ::testing::Test
+{
+    protected:
+    using A0Layout          = std::tuple_element_t<0, Tuple>;
+    using A1Layout          = std::tuple_element_t<1, Tuple>;
+    using B0Layout          = std::tuple_element_t<2, Tuple>;
+    using B1Layout          = std::tuple_element_t<3, Tuple>;
+    using D0Layout          = std::tuple_element_t<4, Tuple>;
+    using D1Layout          = std::tuple_element_t<5, Tuple>;
+    using ELayout           = std::tuple_element_t<6, Tuple>;
+    using A0DataType        = std::tuple_element_t<7, Tuple>;
+    using A1DataType        = std::tuple_element_t<8, Tuple>;
+    using B0DataType        = std::tuple_element_t<9, Tuple>;
+    using B1DataType        = std::tuple_element_t<10, Tuple>;
+    using D0DataType        = std::tuple_element_t<11, Tuple>;
+    using D1DataType        = std::tuple_element_t<12, Tuple>;
+    using AccDataType       = std::tuple_element_t<13, Tuple>;
+    using EDataType         = std::tuple_element_t<14, Tuple>;
+    using AElementWiseFn    = std::tuple_element_t<15, Tuple>;
+    using BElementWiseFn    = std::tuple_element_t<16, Tuple>;
+    using CDElementWiseFn   = std::tuple_element_t<17, Tuple>;
+    using UseCshuffleEpilog = std::tuple_element_t<18, Tuple>;
+
+    using AsLayout   = ck_tile::tuple<A0Layout, A1Layout>;
+    using AsDataType = ck_tile::tuple<A0DataType, A1DataType>;
+    using BsLayout   = ck_tile::tuple<B0Layout, B1Layout>;
+    using BsDataType = ck_tile::tuple<B0DataType, B1DataType>;
+    using DsLayout   = ck_tile::tuple<D0Layout, D1Layout>;
+    using DsDataType = ck_tile::tuple<D0DataType, D1DataType>;
+
+    template <typename AsDataType,
+              typename BsDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename EDataType,
+              typename AsLayout,
+              typename BsLayout,
+              typename DsLayout,
+              typename ELayout,
+              typename AElementWise    = ck_tile::element_wise::PassThrough,
+              typename BElementWise    = ck_tile::element_wise::PassThrough,
+              typename CDElementWiseFn = ck_tile::element_wise::PassThrough>
+    void invoke_gemm_multi_abd(const ck_tile::GemmMultiABDHostArgs<AsDataType::size(),
+                                                                   BsDataType::size(),
+                                                                   DsDataType::size()>& args,
+                               const ck_tile::stream_config& s)
+    {
+        constexpr ck_tile::index_t M_Tile = 256;
+        constexpr ck_tile::index_t N_Tile = 256;
+        constexpr ck_tile::index_t K_Tile = 32;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+        constexpr bool DoubleSmemBuffer = false;
+
+        constexpr bool kPadM = false;
+        constexpr bool kPadN = false;
+        constexpr bool kPadK = false;
+
+        constexpr bool TransposeC = false;
+
+        constexpr int kBlockPerCu                         = 1;
+        constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+        constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+        using TilePartitioner = ck_tile::
+            GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, AsLayout, BsLayout, ELayout>;
+        using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                     kPadN,
+                                                                     kPadK,
+                                                                     DoubleSmemBuffer,
+                                                                     AsLayout,
+                                                                     BsLayout,
+                                                                     ELayout,
+                                                                     TransposeC>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<AsDataType, BsDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = ck_tile::GemmPipelineScheduler::Intrawave;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<AsDataType,
+                                                                               BsDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v,
+                                                                               AElementWise,
+                                                                               BElementWise>;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>;
+
+            using DefaultGemmEpilogue = ck_tile::DefaultGemm2DEpilogue<
+                ck_tile::DefaultGemm2DEpilogueProblem<AsDataType,
+                                                      BsDataType,
+                                                      DsDataType,
+                                                      AccDataType,
+                                                      EDataType,
+                                                      DsLayout,
+                                                      ELayout,
+                                                      CDElementWiseFn,
+                                                      TilePartitioner::MPerBlock,
+                                                      TilePartitioner::NPerBlock,
+                                                      kPadM,
+                                                      kPadN,
+                                                      M_Warp_Tile,
+                                                      N_Warp_Tile,
+                                                      K_Warp_Tile,
+                                                      UniversalGemmProblem::TransposeC,
+                                                      true,
+                                                      memory_operation>>;
+
+            using CShuffleGemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<AsDataType,
+                                                 BsDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 EDataType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 CDElementWiseFn,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+
+            using GemmEpilogue = std::
+                conditional_t<UseCshuffleEpilog::value, CShuffleGemmEpilogue, DefaultGemmEpilogue>;
+
+            using Kernel = ck_tile::GemmKernelMultiABD<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << GemmPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                std::cout << "Run without SplitK" << std::endl;
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::set>{});
+            }
+            else
+            {
+                std::cout << "Run using SplitK" << std::endl;
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::atomic_add>{});
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    }
+
+    public:
+    bool Run(const int M,
+             const int N,
+             const int K,
+             const int k_batch,
+             int StrideA0 = 0,
+             int StrideA1 = 0,
+             int StrideB0 = 0,
+             int StrideB1 = 0,
+             int StrideD0 = 0,
+             int StrideD1 = 0,
+             int StrideE  = 0)
+    {
+        using namespace ck_tile::literals;
+
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        StrideA0 = f_get_default_stride(M, K, StrideA0, A0Layout{});
+        StrideA1 = f_get_default_stride(M, K, StrideA1, A1Layout{});
+
+        StrideB0 = f_get_default_stride(K, N, StrideB0, B0Layout{});
+        StrideB1 = f_get_default_stride(K, N, StrideB1, B1Layout{});
+
+        StrideD0 = f_get_default_stride(M, N, StrideD0, D0Layout{});
+        StrideD1 = f_get_default_stride(M, N, StrideD1, D1Layout{});
+
+        StrideE = f_get_default_stride(M, N, StrideE, ELayout{});
+
+        ck_tile::HostTensor<A0DataType> a0_m_k_tesnor(
+            f_host_tensor_descriptor(M, K, StrideA0, A0Layout{}));
+        ck_tile::HostTensor<A1DataType> a1_m_k_tesnor(
+            f_host_tensor_descriptor(M, K, StrideA1, A1Layout{}));
+
+        ck_tile::HostTensor<B0DataType> b0_k_n_tensors(
+            f_host_tensor_descriptor(K, N, StrideB0, B0Layout{}));
+        ck_tile::HostTensor<B1DataType> b1_k_n_tensors(
+            f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
+
+        ck_tile::HostTensor<D0DataType> d0_m_n_tensors(
+            f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+        ck_tile::HostTensor<D1DataType> d1_m_n_tensors(
+            f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+
+        ck_tile::HostTensor<EDataType> e_m_n_device_result(
+            f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+        ck_tile::FillUniformDistribution<A0DataType>{-1.f, 1.f}(a0_m_k_tesnor);
+        ck_tile::FillUniformDistribution<A0DataType>{-1.f, 1.f}(a1_m_k_tesnor);
+
+        ck_tile::FillUniformDistribution<B0DataType>{-1.f, 1.f}(b0_k_n_tensors);
+        ck_tile::FillUniformDistribution<B1DataType>{-1.f, 1.f}(b1_k_n_tensors);
+
+        ck_tile::FillUniformDistribution<D0DataType>{-1.f, 1.f}(d0_m_n_tensors);
+        ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n_tensors);
+
+        ck_tile::DeviceMem a0_m_k_dev_buf(a0_m_k_tesnor.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem a1_m_k_dev_buf(a1_m_k_tesnor.get_element_space_size_in_bytes());
+
+        ck_tile::DeviceMem b0_k_n_dev_buf(b0_k_n_tensors.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b1_k_n_dev_buf(b1_k_n_tensors.get_element_space_size_in_bytes());
+
+        ck_tile::DeviceMem d0_m_n_dev_buf(d0_m_n_tensors.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem d1_m_n_dev_buf(d1_m_n_tensors.get_element_space_size_in_bytes());
+
+        ck_tile::DeviceMem e_m_n_dev_buf(e_m_n_device_result.get_element_space_size_in_bytes());
+
+        a0_m_k_dev_buf.ToDevice(a0_m_k_tesnor.mData.data());
+        a1_m_k_dev_buf.ToDevice(a1_m_k_tesnor.mData.data());
+
+        b0_k_n_dev_buf.ToDevice(b0_k_n_tensors.mData.data());
+        b1_k_n_dev_buf.ToDevice(b1_k_n_tensors.mData.data());
+
+        d0_m_n_dev_buf.ToDevice(d0_m_n_tensors.mData.data());
+        d1_m_n_dev_buf.ToDevice(d1_m_n_tensors.mData.data());
+
+        e_m_n_dev_buf.SetZero();
+        e_m_n_device_result.SetZero();
+
+        std::array<const void*, DsDataType::size()> as_ptr_buf = {a0_m_k_dev_buf.GetDeviceBuffer(),
+                                                                  a1_m_k_dev_buf.GetDeviceBuffer()};
+
+        std::array<const void*, DsDataType::size()> bs_ptr_buf = {b0_k_n_dev_buf.GetDeviceBuffer(),
+                                                                  b1_k_n_dev_buf.GetDeviceBuffer()};
+
+        std::array<const void*, DsDataType::size()> ds_ptr_buf = {d0_m_n_dev_buf.GetDeviceBuffer(),
+                                                                  d1_m_n_dev_buf.GetDeviceBuffer()};
+
+        std::array<ck_tile::index_t, AsDataType::size()> strideAs = {StrideA0, StrideA1};
+        std::array<ck_tile::index_t, BsDataType::size()> strideBs = {StrideB0, StrideB1};
+        std::array<ck_tile::index_t, DsDataType::size()> strideDs = {StrideD0, StrideD1};
+
+        ck_tile::GemmMultiABDHostArgs<AsDataType::size(), BsDataType::size(), DsDataType::size()>
+            args({as_ptr_buf,
+                  bs_ptr_buf,
+                  ds_ptr_buf,
+                  e_m_n_dev_buf.GetDeviceBuffer(),
+                  k_batch,
+                  M,
+                  N,
+                  K,
+                  strideAs,
+                  strideBs,
+                  strideDs,
+                  StrideE});
+
+        invoke_gemm_multi_abd<AsDataType,
+                              BsDataType,
+                              DsDataType,
+                              AccDataType,
+                              EDataType,
+                              AsLayout,
+                              BsLayout,
+                              DsLayout,
+                              ELayout,
+                              AElementWiseFn,
+                              BElementWiseFn,
+                              CDElementWiseFn>(args, ck_tile::stream_config{nullptr, false});
+
+        std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
+                  << " StrideA0 =" << StrideA0 << " StrideA1 =" << StrideA1
+                  << " StrideB0 =" << StrideB0 << " StrideB1 =" << StrideB1
+                  << " StrideE =" << StrideE << " StrideD0 =" << StrideD0
+                  << " StrideD1 =" << StrideD1 << std::endl;
+
+        e_m_n_dev_buf.FromDevice(e_m_n_device_result.data());
+        bool pass = true;
+
+        ck_tile::HostTensor<A0DataType> a_m_k_host_ref_element_result(
+            f_host_tensor_descriptor(M, K, StrideA0, A0Layout{}));
+        ck_tile::HostTensor<B0DataType> b_k_n_host_ref_element_result(
+            f_host_tensor_descriptor(K, N, StrideB0, B0Layout{}));
+        ck_tile::HostTensor<EDataType> e_m_n_host_ref(
+            f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+        a_m_k_host_ref_element_result.SetZero();
+        b_k_n_host_ref_element_result.SetZero();
+        e_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm_multiple_abd<AsDataType,
+                                             BsDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             EDataType,
+                                             AElementWiseFn,
+                                             BElementWiseFn,
+                                             CDElementWiseFn>({a0_m_k_tesnor, a1_m_k_tesnor},
+                                                              {b0_k_n_tensors, b1_k_n_tensors},
+                                                              {d0_m_n_tensors, d1_m_n_tensors},
+                                                              a_m_k_host_ref_element_result,
+                                                              b_k_n_host_ref_element_result,
+                                                              e_m_n_host_ref);
+
+        const float max_accumulated_value =
+            *std::max_element(e_m_n_host_ref.mData.begin(), e_m_n_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<A0DataType, B0DataType, AccDataType, EDataType, D0DataType>(
+                K, k_batch, max_accumulated_value);
+        pass = ck_tile::check_err(e_m_n_device_result,
+                                  e_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+
+        return pass;
+    }
+};
diff --git a/test/ck_tile/gemm_multi_d/CMakeLists.txt b/test/ck_tile/gemm_multi_d/CMakeLists.txt
index c9d53e53e2..143fb9dc40 100644
--- a/test/ck_tile/gemm_multi_d/CMakeLists.txt
+++ b/test/ck_tile/gemm_multi_d/CMakeLists.txt
@@ -1,10 +1,9 @@
-# Currently ck_tile is only built on gfx9
 set(EXAMPLE_GEMM_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
     list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_gtest_executable(test_gemm_multi_d_cshuffle test_gemm_multi_d_cshuffle.cpp)
     add_gtest_executable(test_gemm_multi_d_default2d test_gemm_multi_d_default2d.cpp)
     target_compile_definitions(test_gemm_multi_d_cshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
index 8d21c65692..798bbb1116 100644
--- a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
+++ b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #pragma once
 
 TYPED_TEST(TestCkTileGemmMultiD, TestCkTileGemmMultiDKBatch1CShuffle_256x512x256)
diff --git a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
index 8399bc7ee3..f0050c15d5 100644
--- a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
+++ b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
@@ -86,7 +86,28 @@ class TestCkTileGemmMultiD : public ::testing::Test
     using DsLayout          = ck_tile::tuple<D0Layout, D1Layout>;
     using DsDataType        = ck_tile::tuple<D0DataType, D1DataType>;
 
-    template <typename ADataType,
+    struct GemmWarpConfig_Mfma
+    {
+        static constexpr ck_tile::index_t M_Tile      = 256;
+        static constexpr ck_tile::index_t N_Tile      = 256;
+        static constexpr ck_tile::index_t K_Tile      = 64;
+        static constexpr ck_tile::index_t M_Warp_Tile = 32;
+        static constexpr ck_tile::index_t N_Warp_Tile = 32;
+        static constexpr ck_tile::index_t K_Warp_Tile = 16;
+    };
+
+    struct GemmWarpConfig_Wmma
+    {
+        static constexpr ck_tile::index_t M_Tile      = 128;
+        static constexpr ck_tile::index_t N_Tile      = 128;
+        static constexpr ck_tile::index_t K_Tile      = 64;
+        static constexpr ck_tile::index_t M_Warp_Tile = 16;
+        static constexpr ck_tile::index_t N_Warp_Tile = 16;
+        static constexpr ck_tile::index_t K_Warp_Tile = 16;
+    };
+
+    template <typename GemmWarpConfig,
+              typename ADataType,
               typename BDataType,
               typename DsDataType,
               typename AccDataType,
@@ -99,17 +120,17 @@ class TestCkTileGemmMultiD : public ::testing::Test
     void invoke_gemm_multi_d(const ck_tile::GemmMultiDHostArgs<DsDataType::size()>& args,
                              const ck_tile::stream_config& s)
     {
-        constexpr ck_tile::index_t M_Tile = 256;
-        constexpr ck_tile::index_t N_Tile = 256;
-        constexpr ck_tile::index_t K_Tile = 64;
+        constexpr ck_tile::index_t M_Tile = GemmWarpConfig::M_Tile;
+        constexpr ck_tile::index_t N_Tile = GemmWarpConfig::N_Tile;
+        constexpr ck_tile::index_t K_Tile = GemmWarpConfig::K_Tile;
 
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
         constexpr ck_tile::index_t K_Warp = 1;
 
-        constexpr ck_tile::index_t M_Warp_Tile = 32;
-        constexpr ck_tile::index_t N_Warp_Tile = 32;
-        constexpr ck_tile::index_t K_Warp_Tile = 16;
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
 
         constexpr bool DoubleSmemBuffer = false;
 
@@ -359,8 +380,9 @@ class TestCkTileGemmMultiD : public ::testing::Test
                                                               StrideB,
                                                               stridesDs,
                                                               StrideE});
-
-        invoke_gemm_multi_d<ADataType,
+#if CK_TILE_USE_WMMA
+        invoke_gemm_multi_d<GemmWarpConfig_Wmma,
+                            ADataType,
                             BDataType,
                             DsDataType,
                             AccDataType,
@@ -370,6 +392,19 @@ class TestCkTileGemmMultiD : public ::testing::Test
                             DsLayout,
                             ELayout,
                             CDElementWiseFn>(args, ck_tile::stream_config{nullptr, false});
+#else
+        invoke_gemm_multi_d<GemmWarpConfig_Mfma,
+                            ADataType,
+                            BDataType,
+                            DsDataType,
+                            AccDataType,
+                            EDataType,
+                            ALayout,
+                            BLayout,
+                            DsLayout,
+                            ELayout,
+                            CDElementWiseFn>(args, ck_tile::stream_config{nullptr, false});
+#endif
 
         std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
                   << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideE =" << StrideE
diff --git a/test/ck_tile/gemm_streamk/CMakeLists.txt b/test/ck_tile/gemm_streamk/CMakeLists.txt
new file mode 100644
index 0000000000..e00874ba07
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Currently test_ck_tile_streamk is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    #TODO: support all arches
+    add_gtest_executable(test_ck_tile_streamk test_gemm_streamk.cpp)
+else()
+    message(DEBUG "Skipping test_ck_tile_streamk tests for current target")
+endif()
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp b/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp
new file mode 100644
index 0000000000..99c3fb397f
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp
@@ -0,0 +1,14 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "test_gemm_streamk_types.hpp"
+#include "test_gemm_streamk_util.hpp"
+#include "gtest/gtest.h"
+
+#define TEST_SUITE_NAME TestCkTileStreamK
+
+TYPED_TEST_SUITE(TestCkTileStreamK, KernelTypesStreamK);
+
+#include "test_gemm_streamk_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
new file mode 100644
index 0000000000..1db7ef0fb0
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
@@ -0,0 +1,118 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_DP)
+{
+
+    ck_tile::index_t M     = 256;
+    ck_tile::index_t N     = 256;
+    ck_tile::index_t K     = 256;
+    uint32_t num_sk_blocks = 0;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks4)
+{
+
+    ck_tile::index_t M     = 256;
+    ck_tile::index_t N     = 256;
+    ck_tile::index_t K     = 256;
+    uint32_t num_sk_blocks = 4;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+// TODO: Renable this test once reduction is implemented
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks12)
+{
+    GTEST_SKIP() << "Skipping this test: There are precision issues with atomics due to >=3 WGs "
+                    "contributing to each macro tile in C";
+
+    ck_tile::index_t M     = 256;
+    ck_tile::index_t N     = 256;
+    ck_tile::index_t K     = 256;
+    uint32_t num_sk_blocks = 12;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks8)
+{
+
+    ck_tile::index_t M     = 256;
+    ck_tile::index_t N     = 256;
+    ck_tile::index_t K     = 256;
+    uint32_t num_sk_blocks = 8;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_DP)
+{
+
+    ck_tile::index_t M     = 512;
+    ck_tile::index_t N     = 512;
+    ck_tile::index_t K     = 512;
+    uint32_t num_sk_blocks = 0;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_SKBlocks16)
+{
+
+    ck_tile::index_t M     = 512;
+    ck_tile::index_t N     = 512;
+    ck_tile::index_t K     = 512;
+    uint32_t num_sk_blocks = 16;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_SKBlocks8)
+{
+
+    ck_tile::index_t M     = 512;
+    ck_tile::index_t N     = 512;
+    ck_tile::index_t K     = 512;
+    uint32_t num_sk_blocks = 8;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_DP)
+{
+
+    ck_tile::index_t M     = 3840;
+    ck_tile::index_t N     = 4096;
+    ck_tile::index_t K     = 4096;
+    uint32_t num_sk_blocks = 0;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_SKBlocks64)
+{
+
+    ck_tile::index_t M     = 3840;
+    ck_tile::index_t N     = 4096;
+    ck_tile::index_t K     = 4096;
+    uint32_t num_sk_blocks = 64;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_Unsupported_Reduction)
+{
+
+    ck_tile::index_t M     = 3840;
+    ck_tile::index_t N     = 4096;
+    ck_tile::index_t K     = 4096;
+    uint32_t num_sk_blocks = 64;
+
+    EXPECT_THROW(this->Run(M, N, K, num_sk_blocks, ck_tile::StreamKReductionStrategy::Reduction),
+                 std::runtime_error);
+}
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp
new file mode 100644
index 0000000000..399f3f11e8
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp
@@ -0,0 +1,25 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <tuple>
+#include <type_traits>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+
+using F16  = ck_tile::half_t;
+using F32  = float;
+using BF16 = ck_tile::bf16_t;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypesStreamK = ::testing::Types<
+//                ALayout  BLayout  CLayout   ADataType  BDataType  AccDataType  CDataType
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       BF16>
+>;
+
+// clang-format on
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
new file mode 100644
index 0000000000..b8a55b024d
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
@@ -0,0 +1,282 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <gtest/gtest.h>
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // The logic below may need to become more advanced once bugs in Stream-K Tile Partitioner are
+    // resolved. Because the number of WGs contributing to a macro tile in C may not be the same for
+    // all macro tiles in C.
+
+    // Calculate error due to more than 1 WG contributing to the same macro tile in C
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename Tuple>
+class TestCkTileStreamK : public ::testing::Test
+{
+    protected:
+    using ALayout     = std::tuple_element_t<0, Tuple>;
+    using BLayout     = std::tuple_element_t<1, Tuple>;
+    using CLayout     = std::tuple_element_t<2, Tuple>;
+    using ADataType   = std::tuple_element_t<3, Tuple>;
+    using BDataType   = std::tuple_element_t<4, Tuple>;
+    using AccDataType = std::tuple_element_t<5, Tuple>;
+    using CDataType   = std::tuple_element_t<6, Tuple>;
+    using DsLayout    = ck_tile::tuple<>;
+    using DsDataType  = ck_tile::tuple<>;
+
+    template <ck_tile::StreamKReductionStrategy ReductionStrategy,
+              bool PadM       = true,
+              bool PadN       = true,
+              bool PadK       = true,
+              bool Preshuffle = false,
+              bool TransposeC = false>
+    void invoke_streamk(const ck_tile::StreamKHostArgs& args,
+                        const ck_tile::stream_config& s,
+                        int num_cu,
+                        int occupancy)
+    {
+
+        constexpr ck_tile::index_t M_Tile = 128;
+        constexpr ck_tile::index_t N_Tile = 128;
+        constexpr ck_tile::index_t K_Tile = 32;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+        constexpr bool kPadM      = PadM;
+        constexpr bool kPadN      = PadN;
+        constexpr bool kPadK      = PadK;
+        constexpr bool preshuffle = Preshuffle;
+
+        constexpr bool DoubleSmemBuffer   = false;
+        constexpr int kBlockPerCu         = 1;
+        constexpr bool StructuredSparsity = false;
+        constexpr bool NumWaveGroup       = 1;
+
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        using TilePartitioner = ck_tile::StreamKTilePartitioner<GemmShape, ReductionStrategy>;
+
+        using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                     kPadN,
+                                                                     kPadK,
+                                                                     DoubleSmemBuffer,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     CLayout,
+                                                                     TransposeC,
+                                                                     StructuredSparsity,
+                                                                     false,
+                                                                     NumWaveGroup,
+                                                                     preshuffle>;
+
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+            constexpr auto scheduler        = ck_tile::GemmPipelineScheduler::Intrawave;
+
+            // We create the GEMM pipeline without specifying has_hot_loop or tail_num.
+            // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K
+            // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K
+            // Kernel's RunGemm function. This is a similar pattern used by grouped GEMM.
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler>;
+            // For initial testing, we will just test with one pipeline.
+            // More extensive testing is coming later and will test other pipelines.
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<UniversalGemmProblem>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+
+            using Kernel = ck_tile::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+
+            auto kargs = Kernel::MakeKernelArgs(args, num_cu, occupancy);
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                EXPECT_TRUE(false);
+            }
+
+            dim3 grid_dims  = Kernel::GridSize(kargs.tile_partitioner);
+            dim3 block_dims = Kernel::BlockSize();
+
+            ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grid_dims, block_dims, 0, kargs));
+        };
+
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       // Since we are doing stream K, in the case of
+                                       // atomics, multiple workgroups may write to the same
+                                       // output tile in the C tensor, so we must atomic add
+                                       // the results (not set)
+                                       ck_tile::memory_operation_enum::atomic_add>{});
+    }
+
+    public:
+    // Since Stream-K is build on gfx9, the lower bound for CUs is 104. Thus, we default num_cu to
+    // 104 and occupancy to 1 to ensure tests are reproducible on different architectures.
+    void Run(ck_tile::index_t M,
+             ck_tile::index_t N,
+             ck_tile::index_t K,
+             uint32_t num_sk_blocks = 0xffffffff,
+             ck_tile::StreamKReductionStrategy reduction_strategy =
+                 ck_tile::StreamKReductionStrategy::Atomic,
+             int occupancy             = 1,
+             int num_cu                = 104,
+             ck_tile::index_t stride_A = 0,
+             ck_tile::index_t stride_B = 0,
+             ck_tile::index_t stride_C = 0)
+    {
+
+        using namespace ck_tile::literals;
+
+        if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
+        {
+            throw std::runtime_error("Reduction Strategy is current unsupported!\n");
+        }
+
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        stride_A = f_get_default_stride(M, K, stride_A, ALayout{});
+        stride_B = f_get_default_stride(K, N, stride_B, BLayout{});
+        stride_C = f_get_default_stride(M, N, stride_C, CLayout{});
+
+        ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
+        ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+
+        ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, /*seed*/ 11939}(a_m_k);
+        ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, /*seed*/ 11940}(b_k_n);
+
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+
+        ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
+                                      b_k_n_dev_buf.GetDeviceBuffer(),
+                                      c_m_n_dev_buf.GetDeviceBuffer(),
+                                      M,
+                                      N,
+                                      K,
+                                      stride_A,
+                                      stride_B,
+                                      stride_C,
+                                      reduction_strategy,
+                                      num_sk_blocks};
+
+        invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
+            args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy);
+
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, /*kbatch*/ 1, max_accumulated_value);
+
+        bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+
+        EXPECT_TRUE(pass);
+    };
+};
diff --git a/test/ck_tile/gemm_weight_preshuffle/CMakeLists.txt b/test/ck_tile/gemm_weight_preshuffle/CMakeLists.txt
index 4b9e6049e3..90803bd9d5 100644
--- a/test/ck_tile/gemm_weight_preshuffle/CMakeLists.txt
+++ b/test/ck_tile/gemm_weight_preshuffle/CMakeLists.txt
@@ -12,7 +12,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
     -enable-noalias-to-md-conversion=0
 )
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_gemm_pipeline_wp test_gemm_pipeline_wp.cpp)
 
     target_compile_options(test_ck_tile_gemm_pipeline_wp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp
index f66f3cb0aa..ed1b1e32ab 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp
@@ -13,6 +13,7 @@ using F16  = ck_tile::half_t;
 using F32  = float;
 using F8   = ck_tile::fp8_t;
 using BF16 = ck_tile::bf16_t;
+using I4   = ck_tile::pk_int4_t;
 
 using Row = ck_tile::tensor_layout::gemm::RowMajor;
 using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
@@ -20,19 +21,25 @@ using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
 using Default = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
                                            ck_tile::GemmPipelineScheduler::Default>;
 
-using WeightPreshuffle =
-    ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::WeightPreshuffle>;
-
-// Adding alias for the F8 parameters to facilitate skipping tests.
-// This alias can be removed once test failures are fixed.
-using F8Types = std::tuple<Row, Col, Row, F8, F8, F32, F16, Default, WeightPreshuffle>;
+using WeightPreshuffleV1 =
+    ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::WeightPreshuffleV1>;
+using WeightPreshuffleV2 =
+    ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::WeightPreshuffleV2>;
 
 // clang-format off
 
 using KernelTypesWeightPreshuffle = ::testing::Types<
-     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Default,        WeightPreshuffle>,
-     std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,      BF16,             Default,        WeightPreshuffle>,
-     F8Types
+     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Default,        WeightPreshuffleV1>,
+     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Default,        WeightPreshuffleV2>,
+      std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       BF16,            Default,        WeightPreshuffleV2>,
+     std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       BF16,            Default,        WeightPreshuffleV1>
+#if !CK_TILE_USE_WMMA || CK_TILE_USE_OCP_FP8
+     ,
+     std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,             Default,        WeightPreshuffleV1>,
+     std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,             Default,        WeightPreshuffleV2>,
+     std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,             Default,        WeightPreshuffleV2>,
+     std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,             Default,        WeightPreshuffleV1>
+#endif     
      >;
 
 // clang-format on
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc
index 389e0d53ea..bb56c63413 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc
@@ -20,7 +20,7 @@ TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle)
 
 TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x128x128)
 {
-    if constexpr(std::is_same_v<TypeParam, F8Types>)
+    if constexpr(std::is_same_v<std::tuple_element_t<3, TypeParam>, F8>)
     {
         GTEST_SKIP() << "Skipping this test due to failures with F8";
     }
@@ -48,7 +48,7 @@ TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x128x4096)
 
 TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x2048x128)
 {
-    if constexpr(std::is_same_v<TypeParam, F8Types>)
+    if constexpr(std::is_same_v<std::tuple_element_t<3, TypeParam>, F8>)
     {
         GTEST_SKIP() << "Skipping this test due to failures with F8";
     }
@@ -77,7 +77,7 @@ TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x2048x4096)
 
 TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x128x128)
 {
-    if constexpr(std::is_same_v<TypeParam, F8Types>)
+    if constexpr(std::is_same_v<std::tuple_element_t<3, TypeParam>, F8>)
     {
         GTEST_SKIP() << "Skipping this test due to failures with F8";
     }
@@ -106,7 +106,7 @@ TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x128x4096)
 
 TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x2048x128)
 {
-    if constexpr(std::is_same_v<TypeParam, F8Types>)
+    if constexpr(std::is_same_v<std::tuple_element_t<3, TypeParam>, F8>)
     {
         GTEST_SKIP() << "Skipping this test due to failures with F8";
     }
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
index 5d52f15696..22d83306c3 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/host/permute_pk_int4.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
@@ -34,20 +35,31 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
 
 enum struct GemmPipelineType
 {
-    WeightPreshuffle
+    WeightPreshuffleV1,
+    WeightPreshuffleV2
 };
 
 template <GemmPipelineType PT, typename Problem>
 struct GemmPipelineTypeSelector;
 
 template <typename Problem>
-struct GemmPipelineTypeSelector<GemmPipelineType::WeightPreshuffle, Problem>
+struct GemmPipelineTypeSelector<GemmPipelineType::WeightPreshuffleV1, Problem>
 {
     using base_pipeline = ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV1<Problem>;
     using pipeline      = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV1<Problem>;
 
-    static constexpr auto GetName() { return "GemmPipelineAgBgCrWeightPreshuffle"; }
+    static constexpr auto GetName() { return "GemmPipelineAgBgCrWeightPreshuffleV1"; }
 };
+
+template <typename Problem>
+struct GemmPipelineTypeSelector<GemmPipelineType::WeightPreshuffleV2, Problem>
+{
+    using base_pipeline = ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<Problem>;
+    using pipeline      = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<Problem>;
+
+    static constexpr auto GetName() { return "GemmPipelineAgBgCrWeightPreshuffleV2"; }
+};
+
 template <typename Datatype>
 struct config
 {
@@ -63,6 +75,23 @@ struct config
     static constexpr ck_tile::index_t N_Warp_Tile = 32;
     static constexpr ck_tile::index_t K_Warp_Tile = sizeof(Datatype) == 2 ? 16 : 32;
 };
+
+template <typename Datatype>
+struct config_wmma
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(Datatype);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
 template <typename Tuple>
 class TestCkTileGemmPipeline : public ::testing::Test
 {
@@ -79,34 +108,21 @@ class TestCkTileGemmPipeline : public ::testing::Test
 
     using DsLayout   = ck_tile::tuple<>;
     using DsDataType = ck_tile::tuple<>;
-    using GemmConfig = config<ADataType>;
 
     static constexpr bool Persistent =
         ck_tile::tuple_element_or_default_t<Tuple, 9, std::false_type>::value;
     // TODO: expose tile size through test t-param ?
 
-    template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
+    template <typename GemmConfig, bool PadM, bool PadN, bool PadK, bool Preshuffle>
     void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     {
-        // TODO: This should be parameterized in tests
-        // constexpr ck_tile::index_t M_Tile = 128;
-        // constexpr ck_tile::index_t N_Tile = 128;
-        // constexpr ck_tile::index_t K_Tile = 128;
-
-        // constexpr ck_tile::index_t M_Warp = 1;
-        // constexpr ck_tile::index_t N_Warp = 4;
-        // constexpr ck_tile::index_t K_Warp = 1;
-
-        // constexpr ck_tile::index_t M_Warp_Tile = 32;
-        // constexpr ck_tile::index_t N_Warp_Tile = 32;
-        // constexpr ck_tile::index_t K_Warp_Tile = sizeof(ADataType) == 2 ? 16 : 32;
-
         constexpr bool kPadM      = PadM;
         constexpr bool kPadN      = PadN;
         constexpr bool kPadK      = PadK;
         constexpr bool preshuffle = Preshuffle;
 
-        constexpr bool DoubleSmemBuffer = false;
+        constexpr bool DoubleSmemBuffer =
+            (PipelineType == GemmPipelineType::WeightPreshuffleV2) ? true : false;
 
         // TODO: For now - but this should also be a test parameter
         constexpr bool TransposeC = false;
@@ -253,6 +269,48 @@ class TestCkTileGemmPipeline : public ::testing::Test
         k_batches_ = {1};
     }
 
+    template <typename GemmConfig, typename T>
+    auto shuffle_b(const ck_tile::HostTensor<T>& t)
+    {
+        assert(t.get_lengths().size() == 2);
+        int n_ = t.get_lengths()[1];
+        int k_ = t.get_lengths()[0];
+
+        if(ck_tile::is_gfx12_supported())
+        {
+            constexpr int divisor      = 2;
+            constexpr int kABK1PerLane = 8;
+            constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+            ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                           GemmConfig::N_Warp_Tile,
+                                           k_ / GemmConfig::K_Warp_Tile,
+                                           kABK0PerLane,
+                                           divisor,
+                                           kABK1PerLane});
+            std::copy(t.begin(), t.end(), t_view.begin());
+            return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+        }
+        else
+        {
+            int divisor = 1;
+            if(ck_tile::is_gfx11_supported())
+            {
+                divisor = 1;
+            }
+            else
+            {
+                assert(is_wave32() == false);
+                divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+            }
+            ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                           GemmConfig::N_Warp_Tile,
+                                           k_ / GemmConfig::K_Warp_Tile,
+                                           divisor,
+                                           GemmConfig::K_Warp_Tile / divisor});
+            std::copy(t.begin(), t.end(), t_view.begin());
+            return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+        }
+    }
     template <bool PadM = true, bool PadN = true, bool PadK = true, bool Preshuffle = false>
     void Run(const int M,
              const int N,
@@ -263,11 +321,17 @@ class TestCkTileGemmPipeline : public ::testing::Test
     {
         for(auto kb : k_batches_)
         {
-            RunSingle<PadM, PadN, PadK, Preshuffle>(M, N, K, StrideA, StrideB, StrideC, kb);
+#if CK_TILE_USE_WMMA
+            RunSingle<config_wmma<ADataType>, PadM, PadN, PadK, Preshuffle>(
+                M, N, K, StrideA, StrideB, StrideC, kb);
+#else
+            RunSingle<config<ADataType>, PadM, PadN, PadK, Preshuffle>(
+                M, N, K, StrideA, StrideB, StrideC, kb);
+#endif
         }
     }
 
-    template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
+    template <typename GemmConfig, bool PadM, bool PadN, bool PadK, bool Preshuffle>
     void RunSingle(const int M,
                    const int N,
                    const int K,
@@ -327,19 +391,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
         ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
         ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
 
-        constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
-        ck_tile::HostTensor<BDataType> t_view({N / GemmConfig::N_Warp_Tile,
-                                               GemmConfig::N_Warp_Tile,
-                                               K / GemmConfig::K_Warp_Tile,
-                                               divisor,
-                                               GemmConfig::K_Warp_Tile / divisor});
-
-        std::copy(b_k_n.begin(), b_k_n.end(), t_view.begin());
-        ck_tile::HostTensor<BDataType> b_shuffle_host =
-            ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
-
         a_m_k_dev_buf.ToDevice(a_m_k.data());
-        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+        ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_shuffle_host_dev = b_shuffle_host;
+            ck_tile::permute_vectors_i4x4_b(b_shuffle_host_dev);
+            b_k_n_dev_buf.ToDevice(b_shuffle_host_dev.data());
+        }
+        else
+        {
+            b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+        }
         c_m_n_dev_buf.SetZero();
         c_m_n_dev_result.SetZero();
 
@@ -354,7 +418,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                    stride_B,
                                    stride_C};
 
-        invoke_gemm<PadM, PadN, PadK, Preshuffle>(args, ck_tile::stream_config{nullptr, false});
+        invoke_gemm<GemmConfig, PadM, PadN, PadK, Preshuffle>(
+            args, ck_tile::stream_config{nullptr, false});
 
         c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
         bool pass = true;
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
index de71c4682d..d836c501ae 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
diff --git a/test/ck_tile/grouped_gemm/CMakeLists.txt b/test/ck_tile/grouped_gemm/CMakeLists.txt
index f4845847f1..4fd5c82ae9 100644
--- a/test/ck_tile/grouped_gemm/CMakeLists.txt
+++ b/test/ck_tile/grouped_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
 # Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_grouped_gemm test_grouped_gemm.cpp)
 endif()
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
index 51f51649d5..0e79aff700 100644
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -31,7 +31,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
     using PersistentType             = std::tuple_element_t<7, Tuple>;
     static constexpr bool Persistent = PersistentType::value;
 
-    struct GroupedGemKernelParam
+    struct GroupedGemKernelParam_Mfma
     {
         static const bool kPadM = false;
         static const bool kPadN = false;
@@ -54,13 +54,24 @@ class TestCkTileGroupedGemm : public ::testing::Test
         static const ck_tile::index_t K_Warp_Tile = 16;
     };
 
-    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
+    struct GroupedGemKernelParam_Wmma : public GroupedGemKernelParam_Mfma
+    {
+        static const ck_tile::index_t M_Tile = 128;
+        static const ck_tile::index_t N_Tile = 128;
+        static const ck_tile::index_t K_Tile = 64;
+
+        static const ck_tile::index_t M_Warp_Tile = 16;
+        static const ck_tile::index_t N_Warp_Tile = 16;
+        static const ck_tile::index_t K_Warp_Tile = 16;
+    };
+
+    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs<>;
     std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
     {
-        return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
+        return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<>);
     }
 
-    template <typename ALayout, typename BLayout, typename CLayout>
+    template <typename GroupedGemKernelParam, typename ALayout, typename BLayout, typename CLayout>
     void invoke_grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                              const ck_tile::stream_config& s,
                              void* kargs_ptr)
@@ -203,7 +214,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
         BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
     }
 
-    template <typename ALayout, typename BLayout, typename CLayout>
+    template <typename GroupedGemKernelParam, typename ALayout, typename BLayout, typename CLayout>
     void invoke_grouped_gemm_persistent(const ck_tile::stream_config& s,
                                         const ck_tile::index_t num_groups,
                                         void* kargs_ptr,
@@ -431,8 +442,18 @@ class TestCkTileGroupedGemm : public ::testing::Test
             // TODO add support for kbatch > 1
             static constexpr ck_tile::index_t k_batch = 1;
 
-            gemm_descs.push_back(
-                {p_a, p_b, p_c, k_batch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+            gemm_descs.push_back({p_a,
+                                  p_b,
+                                  {/*ds_ptr*/},
+                                  p_c,
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_As[i],
+                                  stride_Bs[i],
+                                  {/*stride_Ds*/},
+                                  stride_Cs[i]});
         }
 
         ck_tile::DeviceMem gemm_workspace;
@@ -441,7 +462,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
         if constexpr(Persistent)
         {
             // Generate kernel arguments
-            std::vector<ck_tile::GemmTransKernelArg> kargs;
+            std::vector<ck_tile::GemmTransKernelArg<>> kargs;
             void* kargs_ptr   = gemm_workspace.GetDeviceBuffer();
             const bool splitk = gemm_descs[0].k_batch > 1;
             for(const auto& arg : gemm_descs)
@@ -463,18 +484,30 @@ class TestCkTileGroupedGemm : public ::testing::Test
             ck_tile::hip_check_error(
                 hipMemcpyWithStream(kargs_ptr,
                                     kargs.data(),
-                                    kargs.size() * sizeof(ck_tile::GemmTransKernelArg),
+                                    kargs.size() * sizeof(ck_tile::GemmTransKernelArg<>),
                                     hipMemcpyHostToDevice,
                                     stream.stream_id_));
-            invoke_grouped_gemm_persistent<ALayout, BLayout, CLayout>(
+#if CK_TILE_USE_WMMA
+            invoke_grouped_gemm_persistent<GroupedGemKernelParam_Wmma, ALayout, BLayout, CLayout>(
                 stream, group_count, kargs_ptr, splitk);
+#else
+            invoke_grouped_gemm_persistent<GroupedGemKernelParam_Mfma, ALayout, BLayout, CLayout>(
+                stream, group_count, kargs_ptr, splitk);
+#endif
         }
         else
         {
-            invoke_grouped_gemm<ALayout, BLayout, CLayout>(
+#if CK_TILE_USE_WMMA
+            invoke_grouped_gemm<GroupedGemKernelParam_Wmma, ALayout, BLayout, CLayout>(
                 gemm_descs,
                 ck_tile::stream_config{nullptr, false, 1},
                 gemm_workspace.GetDeviceBuffer());
+#else
+            invoke_grouped_gemm<GroupedGemKernelParam_Mfma, ALayout, BLayout, CLayout>(
+                gemm_descs,
+                ck_tile::stream_config{nullptr, false, 1},
+                gemm_workspace.GetDeviceBuffer());
+#endif
         }
 
         // Copy results back to host for validation
diff --git a/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt b/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt
new file mode 100644
index 0000000000..20c4cbc1c3
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+
+if(GPU_TARGETS MATCHES "gfx94|gfx95")
+    add_gtest_executable(test_ck_tile_grouped_gemm_multi_d test_grouped_gemm_multi_d.cpp)
+    target_compile_options(test_ck_tile_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+endif()
\ No newline at end of file
diff --git a/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp
new file mode 100644
index 0000000000..deea2fc852
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include "ck_tile/host.hpp"
+#include "test_grouped_gemm_multi_d_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F8  = ck_tile::fp8_t;
+using F32 = float;
+
+// Custom tuple-like structure for kernel configuration
+template <typename ALayout_,
+          typename BLayout_,
+          typename ELayout_,
+          typename ADataType_,
+          typename BDataType_,
+          typename AccDataType_,
+          typename EDataType_,
+          int M_Tile_val_,
+          int N_Tile_val_,
+          int K_Tile_val_,
+          int M_Warp_val_,
+          int N_Warp_val_,
+          int K_Warp_val_,
+          int M_Warp_Tile_val_,
+          int N_Warp_Tile_val_,
+          int K_Warp_Tile_val_,
+          bool DoubleSmemBuffer_val_,
+          ck_tile::GemmPipelineScheduler Scheduler_val_,
+          PipelineType Pipeline_val_>
+struct KernelConfig
+{
+    using ALayoutType  = ALayout_;
+    using BLayoutType  = BLayout_;
+    using ELayoutType  = ELayout_;
+    using DsLayoutType = ck_tile::tuple<Row, Row>;
+    using ADataType    = ADataType_;
+    using BDataType    = BDataType_;
+    using AccDataType  = AccDataType_;
+    using EDataType    = EDataType_;
+    using DsDataType   = ck_tile::tuple<F16, F16>;
+
+    static constexpr int M_Tile_            = M_Tile_val_;
+    static constexpr int N_Tile_            = N_Tile_val_;
+    static constexpr int K_Tile_            = K_Tile_val_;
+    static constexpr int M_Warp_            = M_Warp_val_;
+    static constexpr int N_Warp_            = N_Warp_val_;
+    static constexpr int K_Warp_            = K_Warp_val_;
+    static constexpr int M_Warp_Tile_       = M_Warp_Tile_val_;
+    static constexpr int N_Warp_Tile_       = N_Warp_Tile_val_;
+    static constexpr int K_Warp_Tile_       = K_Warp_Tile_val_;
+    static constexpr bool DoubleSmemBuffer_ = DoubleSmemBuffer_val_;
+    static constexpr auto Scheduler_        = Scheduler_val_;
+    static constexpr PipelineType Pipeline_ = Pipeline_val_;
+    static constexpr int BlockPerCu_        = 1;
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //             ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, EDataType, M_N_KTiles,    M_N_K_Warps,     M_N_K_Warp_Tile, DoubleSmemBuffer, Scheduler, Pipeline
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,         F32,       F16,  128, 32, 64,    4, 1, 1,       32, 32, 8,        false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory>, // memory
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,         F32,       F16,  256, 256, 64,   2, 2, 1,       32, 32, 16,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3>, // v3
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,         F32,       F16,  256, 256, 32,   2, 2, 1,       32, 32, 16,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4> // v4
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGroupedGemmMultiD, KernelTypes);
+
+#include "test_grouped_gemm_multi_d_ut_cases.inc"
diff --git a/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_ut_cases.inc b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_ut_cases.inc
new file mode 100644
index 0000000000..9c3a33cf59
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_ut_cases.inc
@@ -0,0 +1,91 @@
+#pragma once
+
+TYPED_TEST(TestCkTileGroupedGemmMultiD, K256)
+{
+    const int group_count = 7;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Es;
+    std::vector<int> stride_D0;
+    std::vector<int> stride_D1;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(256 + 512 * i);
+        Ks.push_back(512 + 256 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Es.push_back(Ns[i]);
+        stride_D0.push_back(Ns[i]);
+        stride_D1.push_back(Ns[i]);
+    }
+
+    this->Run(
+        Ms, Ns, Ks, stride_As, stride_Bs, stride_Es, stride_D0, stride_D1, kbatch, group_count);
+}
+
+TYPED_TEST(TestCkTileGroupedGemmMultiD, K128)
+{
+    const int group_count = 5;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Es;
+    std::vector<int> stride_D0;
+    std::vector<int> stride_D1;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(256 + 512 * i);
+        Ks.push_back(512 + 128 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Es.push_back(Ns[i]);
+        stride_D0.push_back(Ns[i]);
+        stride_D1.push_back(Ns[i]);
+    }
+
+    this->Run(
+        Ms, Ns, Ks, stride_As, stride_Bs, stride_Es, stride_D0, stride_D1, kbatch, group_count);
+}
+
+TYPED_TEST(TestCkTileGroupedGemmMultiD, LargeMNK_8Groups)
+{
+    const int group_count = 8;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Es;
+    std::vector<int> stride_D0;
+    std::vector<int> stride_D1;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(512 + 256 * i);
+        Ns.push_back(512 + 256 * i);
+        Ks.push_back(768 + 256 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Es.push_back(Ns[i]);
+        stride_D0.push_back(Ns[i]);
+        stride_D1.push_back(Ns[i]);
+    }
+
+    this->Run(
+        Ms, Ns, Ks, stride_As, stride_Bs, stride_Es, stride_D0, stride_D1, kbatch, group_count);
+}
diff --git a/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_util.hpp b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_util.hpp
new file mode 100644
index 0000000000..4c13b4a7f7
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d_util.hpp
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+
+enum class PipelineType
+{
+    Memory = 0,
+    CompV3 = 1,
+    CompV4 = 2
+};
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const D0& d0, const D1& d1) const -> void
+    {
+        const float x0_f = ck_tile::type_convert<float>(c) * ck_tile::type_convert<float>(d0) *
+                           ck_tile::type_convert<float>(d1);
+
+        e = ck_tile::type_convert<E>(x0_f);
+    }
+};
+
+template <typename Config>
+class TestCkTileGroupedGemmMultiD : public ::testing::Test
+{
+    protected:
+    using ALayout     = typename Config::ALayoutType;
+    using BLayout     = typename Config::BLayoutType;
+    using ELayout     = typename Config::ELayoutType;
+    using DsLayout    = typename Config::DsLayoutType;
+    using ADataType   = typename Config::ADataType;
+    using BDataType   = typename Config::BDataType;
+    using AccDataType = typename Config::AccDataType;
+    using EDataType   = typename Config::EDataType;
+    using PrecType    = BDataType;
+    using DsDataType  = typename Config::DsDataType;
+    using D0DataType  = std::tuple_element_t<0, DsDataType>;
+    using D1DataType  = std::tuple_element_t<1, DsDataType>;
+    using D0Layout    = std::tuple_element_t<0, DsLayout>;
+    using D1Layout    = std::tuple_element_t<1, DsLayout>;
+
+    static const bool kPadM = false;
+    static const bool kPadN = false;
+    static const bool kPadK = false;
+
+    static constexpr bool TransposeC = false; // transpose c is not supported
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    auto calculate_rtol_atol(const ck_tile::index_t K,
+                             const ck_tile::index_t kbatch,
+                             const float max_accumulated_value)
+    {
+        using ComputeTypeAB =
+            std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+
+        using ComputeType = std::
+            conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+        // Calculate thresholds
+        const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+            ck_tile::integer_divide_ceil(K, kbatch));
+
+        const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+            max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+        // Calculate error due to split_k accumulation
+        const auto rtol_split_k =
+            ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+        const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+            max_accumulated_value, kbatch);
+
+        // Use higher threshold
+        return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+    }
+
+    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs<DsDataType::size()>;
+    inline std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
+    {
+        return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<DsDataType::size()>);
+    }
+
+    template <typename ALayout, typename BLayout, typename ELayout>
+    void invoke_grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                             const ck_tile::stream_config& s,
+                             void* kargs_ptr)
+    {
+
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<Config::M_Tile_, Config::N_Tile_, Config::K_Tile_>,
+            ck_tile::sequence<Config::M_Warp_, Config::N_Warp_, Config::K_Warp_>,
+            ck_tile::sequence<Config::M_Warp_Tile_, Config::N_Warp_Tile_, Config::K_Warp_Tile_>>;
+        using TilePartitioner = ck_tile::
+            GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, ELayout>;
+
+        // for testing purposes, we can hardcode the values here as we what is compatible with
+        // pipeline
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<kPadM,
+                                             kPadN,
+                                             kPadK,
+                                             Config::DoubleSmemBuffer_,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             TransposeC,
+                                             /*UseStructuredSparsity*/ false,
+                                             /*Persistent*/ false,
+                                             /*NumWaveGroups*/ 1,
+                                             /*Preshuffle*/ false>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain = gemm_descs[0].k_batch * Config::K_Tile_;
+        const ck_tile::index_t K_split =
+            (gemm_descs[0].K + k_grain - 1) / k_grain * Config::K_Tile_;
+        const ck_tile::index_t num_loop =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       TileParitionerGroupNum,
+                                                       TileParitionerM01>::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto memory_operation = memory_operation_.value;
+            using UniversalGemmProblem      = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                                    BDataType,
+                                                                                    AccDataType,
+                                                                                    GemmShape,
+                                                                                    GemmUniversalTraits,
+                                                                                    Config::Scheduler_,
+                                                                                    has_hot_loop_v,
+                                                                                    tail_number_v>;
+
+            using GemmPipeline = std::conditional_t<
+                Config::Pipeline_ == (PipelineType::Memory),
+                ck_tile::GemmPipelineAgBgCrMem<UniversalGemmProblem>,
+                std::conditional_t<Config::Pipeline_ == (PipelineType::CompV3),
+                                   ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>,
+                                   ck_tile::GemmPipelineAgBgCrCompV4<UniversalGemmProblem>>>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 EDataType,
+                                                 DsLayout,
+                                                 ELayout,
+                                                 MultiplyMultiply,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 Config::M_Warp_,
+                                                 Config::N_Warp_,
+                                                 Config::M_Warp_Tile_,
+                                                 Config::N_Warp_Tile_,
+                                                 Config::K_Warp_Tile_,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+            using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKargs(gemm_descs);
+            EXPECT_TRUE(Kernel::IsSupportedArgument(kargs));
+            const dim3 grids  = Kernel::GridSize(gemm_descs);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel: " << Kernel::GetName()
+                          << " with args:" << " grid: {" << grids.x << ", " << grids.y << ", "
+                          << grids.z << "}" << ", blocks: {" << blocks.x << ", " << blocks.y << ", "
+                          << blocks.z << "}" << std::endl;
+            }
+
+            ck_tile::hip_check_error(hipMemcpyWithStream(kargs_ptr,
+                                                         kargs.data(),
+                                                         get_workspace_size(gemm_descs),
+                                                         hipMemcpyHostToDevice,
+                                                         s.stream_id_));
+
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<Config::BlockPerCu_>(
+                    Kernel{},
+                    grids,
+                    blocks,
+                    0,
+                    ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                    gemm_descs.size()));
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(gemm_descs[0].k_batch == 1)
+            {
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::set>{});
+            }
+            else
+            {
+                // EXPECT TO FAIL because splitk is not supported
+                EXPECT_FALSE(true);
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    }
+
+    public:
+    void Run(const std::vector<int>& Ms,
+             const std::vector<int>& Ns,
+             const std::vector<int>& Ks,
+             std::vector<int>& stride_As,
+             std::vector<int>& stride_Bs,
+             std::vector<int>& stride_Es,
+             std::vector<int>& stride_D0,
+             std::vector<int>& stride_D1,
+             const int kbatch      = 1,
+             const int group_count = 16)
+    {
+
+        using namespace ck_tile::literals;
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+        std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+        std::vector<ck_tile::HostTensor<EDataType>> e_m_n_tensors;
+        std::vector<ck_tile::HostTensor<D0DataType>> d0_m_n_tensors;
+        std::vector<ck_tile::HostTensor<D1DataType>> d1_m_n_tensors;
+
+        a_m_k_tensors.reserve(group_count);
+        b_k_n_tensors.reserve(group_count);
+        e_m_n_tensors.reserve(group_count);
+        d0_m_n_tensors.reserve(group_count);
+        d1_m_n_tensors.reserve(group_count);
+
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> e_m_n_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> d0_m_n_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> d1_m_n_dev_buf;
+
+        a_m_k_dev_buf.reserve(group_count);
+        b_k_n_dev_buf.reserve(group_count);
+        e_m_n_dev_buf.reserve(group_count);
+        d0_m_n_dev_buf.reserve(group_count);
+        d1_m_n_dev_buf.reserve(group_count);
+
+        std::vector<grouped_gemm_kargs> gemm_descs;
+        gemm_descs.reserve(group_count);
+
+        for(int i = 0; i < group_count; ++i)
+        {
+            const ck_tile::index_t M = Ms[i];
+            const ck_tile::index_t N = Ns[i];
+            const ck_tile::index_t K = Ks[i];
+
+            stride_As[i] = f_get_default_stride(M, K, stride_As[i], ALayout{});
+            stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{});
+            stride_Es[i] = f_get_default_stride(M, N, stride_Es[i], ELayout{});
+            stride_D0[i] = f_get_default_stride(M, N, stride_D0[i], D0Layout{});
+            stride_D1[i] = f_get_default_stride(M, N, stride_D1[i], D1Layout{});
+
+            a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
+                f_host_tensor_descriptor(M, K, stride_As[i], ALayout{})));
+            b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
+                f_host_tensor_descriptor(K, N, stride_Bs[i], BLayout{})));
+            e_m_n_tensors.push_back(ck_tile::HostTensor<EDataType>(
+                f_host_tensor_descriptor(M, N, stride_Es[i], ELayout{})));
+            d0_m_n_tensors.push_back(ck_tile::HostTensor<D0DataType>(
+                f_host_tensor_descriptor(M, N, stride_D0[i], D0Layout{})));
+            d1_m_n_tensors.push_back(ck_tile::HostTensor<D1DataType>(
+                f_host_tensor_descriptor(M, N, stride_D1[i], D1Layout{})));
+
+            std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc
+                      << " b_k_n: " << b_k_n_tensors[i].mDesc
+                      << " e_m_n: " << e_m_n_tensors[i].mDesc
+                      << " d0_m_n: " << d0_m_n_tensors[i].mDesc
+                      << " d1_m_n: " << d1_m_n_tensors[i].mDesc << std::endl;
+
+            ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
+            ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
+            ck_tile::FillUniformDistribution<D0DataType>{-2.f, 2.f}(d0_m_n_tensors[i]);
+            ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n_tensors[i]);
+
+            a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                a_m_k_tensors[i].get_element_space_size_in_bytes()));
+            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                b_k_n_tensors[i].get_element_space_size_in_bytes()));
+            e_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                e_m_n_tensors[i].get_element_space_size_in_bytes()));
+            d0_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                d0_m_n_tensors[i].get_element_space_size_in_bytes()));
+            d1_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                d1_m_n_tensors[i].get_element_space_size_in_bytes()));
+
+            a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
+            b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
+            e_m_n_dev_buf[i]->SetZero();
+            d0_m_n_dev_buf[i]->ToDevice(d0_m_n_tensors[i].data());
+            d1_m_n_dev_buf[i]->ToDevice(d1_m_n_tensors[i].data());
+
+            const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
+            const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
+            void* p_e       = e_m_n_dev_buf[i]->GetDeviceBuffer();
+
+            std::array<const void*, DsDataType::size()> ds_ptr_buf = {
+                d0_m_n_dev_buf[i]->GetDeviceBuffer(), d1_m_n_dev_buf[i]->GetDeviceBuffer()};
+            std::array<ck_tile::index_t, DsDataType::size()> stridesDs = {stride_D0[i],
+                                                                          stride_D1[i]};
+
+            gemm_descs.push_back({p_a,
+                                  p_b,
+                                  ds_ptr_buf,
+                                  p_e,
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_As[i],
+                                  stride_Bs[i],
+                                  stridesDs,
+                                  stride_Es[i]});
+        }
+
+        ck_tile::DeviceMem gemm_workspace;
+        gemm_workspace.Realloc(get_workspace_size(gemm_descs));
+
+        invoke_grouped_gemm<ALayout, BLayout, ELayout>(gemm_descs,
+                                                       ck_tile::stream_config{nullptr, false, 1},
+                                                       gemm_workspace.GetDeviceBuffer());
+
+        // Copy results back to host for validation
+        for(int i = 0; i < group_count; i++)
+        {
+            e_m_n_dev_buf[i]->FromDevice(e_m_n_tensors[i].data());
+        }
+
+        std::vector<ck_tile::HostTensor<EDataType>> e_m_n_host_refs;
+        e_m_n_host_refs.reserve(group_count);
+
+        bool pass{true};
+        for(int i = 0; i < group_count; ++i)
+        {
+            e_m_n_host_refs.push_back(ck_tile::HostTensor<EDataType>(
+                f_host_tensor_descriptor(Ms[i], Ns[i], stride_Es[i], ELayout{})));
+
+            e_m_n_host_refs[i].SetZero();
+
+            ck_tile::reference_gemm_multiple_d<ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               AccDataType,
+                                               EDataType,
+                                               MultiplyMultiply>(
+                a_m_k_tensors[i],
+                b_k_n_tensors[i],
+                {d0_m_n_tensors[i], d1_m_n_tensors[i]},
+                e_m_n_host_refs[i]);
+            const float max_accumulated_value =
+                *std::max_element(e_m_n_host_refs[i].mData.begin(), e_m_n_host_refs[i].mData.end());
+
+            const auto rtol_atol = calculate_rtol_atol(Ks[i], 1, max_accumulated_value);
+
+            pass &=
+                ck_tile::check_err(e_m_n_tensors[i],
+                                   e_m_n_host_refs[i],
+                                   "Error: Incorrect results! in group [" + std::to_string(i) + "]",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+            std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                      << std::endl;
+        }
+        EXPECT_TRUE(pass);
+    }
+};
diff --git a/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt b/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt
new file mode 100644
index 0000000000..68120efc7e
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+
+if(GPU_TARGETS MATCHES "gfx94|gfx95")
+    add_gtest_executable(test_ck_tile_grouped_gemm_preshuffle test_grouped_gemm_preshuffle.cpp)
+    target_compile_options(test_ck_tile_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+endif()
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp
new file mode 100644
index 0000000000..cf10853b3f
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_grouped_gemm_preshuffle_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F8  = ck_tile::fp8_t;
+using F32 = float;
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// Custom tuple-like structure for kernel configuration
+template <typename ALayout_,
+          typename BLayout_,
+          typename CLayout_,
+          typename ADataType_,
+          typename BDataType_,
+          typename AccDataType_,
+          typename CDataType_,
+          int M_Tile_val_,
+          int N_Tile_val_,
+          int K_Tile_val_,
+          int BlockPerCu_val_>
+struct KernelConfig
+{
+    using ALayoutType = ALayout_;
+    using BLayoutType = BLayout_;
+    using CLayoutType = CLayout_;
+    using ADataType   = ADataType_;
+    using BDataType   = BDataType_;
+    using AccDataType = AccDataType_;
+    using CDataType   = CDataType_;
+
+    static constexpr int M_Tile_     = M_Tile_val_;
+    static constexpr int N_Tile_     = N_Tile_val_;
+    static constexpr int K_Tile_     = K_Tile_val_;
+    static constexpr int BlockPerCu_ = BlockPerCu_val_;
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //               ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, M_Tile, N_Tile, K_Tile, BlockPerCu
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,       16,     64,    256,         1>,
+    KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,       16,     64,    256,         1>,
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,      128,    128,    128,         2>,
+    KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,      128,    128,    128,         2>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGroupedGemmPreshuffle, KernelTypes);
+
+#include "test_grouped_gemm_preshuffle_ut_cases.inc"
+#include "test_grouped_gemm_preshuffle_prefill_cases.inc"
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_prefill_cases.inc b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_prefill_cases.inc
new file mode 100644
index 0000000000..340d807ba2
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_prefill_cases.inc
@@ -0,0 +1,61 @@
+#pragma once
+
+// Test with prefill config struct
+TYPED_TEST(TestCkTileGroupedGemmPreshuffle, PrefillVariant)
+{
+    const int group_count = 4;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Cs;
+
+    for(int i = 0; i < group_count; i++)
+    {
+
+        Ms.push_back(256 + 128 * i);
+        Ns.push_back(256 + 128 * i);
+        Ks.push_back(128 * (i + 1));
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count);
+}
+
+TYPED_TEST(TestCkTileGroupedGemmPreshuffle, VariedDimensions)
+{
+    const int group_count = 6;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Cs;
+
+    std::vector<std::tuple<int, int, int>> test_cases = {{64, 128, 256},
+                                                         {128, 256, 512},
+                                                         {256, 512, 1024},
+                                                         {512, 256, 128},
+                                                         {128, 128, 128},
+                                                         {64, 512, 256}};
+
+    for(int i = 0; i < group_count; i++)
+    {
+        auto [M, N, K] = test_cases[i];
+        Ms.push_back(M);
+        Ns.push_back(N);
+        Ks.push_back(K);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count);
+}
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_ut_cases.inc b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_ut_cases.inc
new file mode 100644
index 0000000000..beca5e62b5
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_ut_cases.inc
@@ -0,0 +1,53 @@
+#pragma once
+
+// kPadK is not needed for these k values
+TYPED_TEST(TestCkTileGroupedGemmPreshuffle, kPadKFalse)
+{
+    const int group_count = 4;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Cs;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(256 + 512 * i);
+        Ks.push_back(512 + 256 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count);
+}
+
+// kPadK is needed to be true for these k values
+TYPED_TEST(TestCkTileGroupedGemmPreshuffle, kPadKTrue)
+{
+    const int group_count = 4;
+    const int kbatch      = 1;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Cs;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(256 + 512 * i);
+        Ks.push_back(512 + 128 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count);
+}
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp
new file mode 100644
index 0000000000..d2f64920fd
--- /dev/null
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile_flatmm()
+{
+#if defined(CK_GFX950_SUPPORT)
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 64;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 128;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 32;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 64;
+#endif
+}
+
+template <typename Tuple>
+class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
+{
+    protected:
+    using ALayout     = typename Tuple::ALayoutType;
+    using BLayout     = typename Tuple::BLayoutType;
+    using CLayout     = typename Tuple::CLayoutType;
+    using ADataType   = typename Tuple::ADataType;
+    using BDataType   = typename Tuple::BDataType;
+    using AccDataType = typename Tuple::AccDataType;
+    using CDataType   = typename Tuple::CDataType;
+    using PrecType    = BDataType;
+    using DsLayout    = ck_tile::tuple<>; // not used
+    using DsDataType  = ck_tile::tuple<>; // not used
+
+    static const bool kPadM = false;
+    static const bool kPadN = false;
+    static const bool kPadK = true; // preshuffle pipeline requires k padding
+
+    static const int kBlockPerCu = Tuple::BlockPerCu_;
+
+    // Tile dimensions from tuple
+    static const ck_tile::index_t M_Tile = Tuple::M_Tile_;
+    static const ck_tile::index_t N_Tile = Tuple::N_Tile_;
+    static const ck_tile::index_t K_Tile = Tuple::K_Tile_;
+
+    static const ck_tile::index_t M_Warp = 1;
+    static const ck_tile::index_t N_Warp = 4;
+    static const ck_tile::index_t K_Warp = 1;
+
+    static const ck_tile::index_t M_Warp_Tile = 16;
+    static const ck_tile::index_t N_Warp_Tile = 16;
+    static const ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<BDataType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer = true;  // preshuffle v2 uses ping-pong smem
+    static constexpr bool TransposeC       = false; // transpose c is not supported
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+    auto calculate_rtol_atol(const ck_tile::index_t K,
+                             const ck_tile::index_t kbatch,
+                             const float max_accumulated_value)
+    {
+        using ComputeType =
+            std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+        // Calculate thresholds
+        const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+            ck_tile::integer_divide_ceil(K, kbatch));
+        const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+            max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+        // Calculate error due to split_k accumulation
+        const auto rtol_split_k =
+            ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+        const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+            max_accumulated_value, kbatch);
+        // Use higher threshold
+        return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+    }
+
+    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs<>;
+    inline std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
+    {
+        return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<>);
+    }
+
+    template <typename T>
+    auto shuffle_b(const ck_tile::HostTensor<T>& t)
+    {
+        assert(t.get_lengths().size() == 2);
+        int n_                = t.get_lengths()[1];
+        int k_                = t.get_lengths()[0];
+        constexpr int divisor = N_Warp_Tile == 32 ? 2 : 4;
+        ck_tile::HostTensor<T> t_view(
+            {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    void invoke_grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                             const ck_tile::stream_config& s,
+                             void* kargs_ptr)
+    {
+
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+        using TilePartitioner = ck_tile::
+            GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+
+        // for testing purposes, we can hardcode the values here as we what is compatible with
+        // pipeline
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<kPadM,
+                                             kPadN,
+                                             kPadK,
+                                             DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             CLayout,
+                                             TransposeC,
+                                             /*UseStructuredSparsity*/ false,
+                                             /*Persistent*/ false,
+                                             /*NumWaveGroups*/ 1,
+                                             /*Preshuffle*/ true>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline =
+            ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain = gemm_descs[0].k_batch * K_Tile;
+        const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile;
+        const ck_tile::index_t num_loop =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       TileParitionerGroupNum,
+                                                       TileParitionerM01>::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto memory_operation = memory_operation_.value;
+            using UniversalGemmProblem =
+                ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      GemmShape,
+                                                      GemmUniversalTraits,
+                                                      ck_tile::GemmPipelineScheduler::Default,
+                                                      has_hot_loop_v,
+                                                      tail_number_v>;
+            using GemmPipeline =
+                ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<UniversalGemmProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 DsLayout,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+            using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKargs(gemm_descs);
+            EXPECT_TRUE(Kernel::IsSupportedArgument(kargs));
+            const dim3 grids  = Kernel::GridSize(gemm_descs);
+            const dim3 blocks = Kernel::BlockSize();
+
+            ck_tile::hip_check_error(hipMemcpyWithStream(kargs_ptr,
+                                                         kargs.data(),
+                                                         get_workspace_size(gemm_descs),
+                                                         hipMemcpyHostToDevice,
+                                                         s.stream_id_));
+
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<kBlockPerCu>(
+                    Kernel{},
+                    grids,
+                    blocks,
+                    0,
+                    ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                    gemm_descs.size()));
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(gemm_descs[0].k_batch == 1)
+            {
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                               ck_tile::memory_operation_enum::set>{});
+            }
+            else
+            {
+                // EXPECT TO FAIL because splitk is not supported
+                EXPECT_FALSE(true);
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    }
+
+    public:
+    void Run(const std::vector<int>& Ms,
+             const std::vector<int>& Ns,
+             const std::vector<int>& Ks,
+             std::vector<int>& stride_As,
+             std::vector<int>& stride_Bs,
+             std::vector<int>& stride_Cs,
+             const int kbatch      = 1,
+             const int group_count = 16)
+    {
+
+        using namespace ck_tile::literals;
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+        std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+        std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
+
+        a_m_k_tensors.reserve(group_count);
+        b_k_n_tensors.reserve(group_count);
+        c_m_n_tensors.reserve(group_count);
+
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
+
+        a_m_k_dev_buf.reserve(group_count);
+        b_k_n_dev_buf.reserve(group_count);
+        c_m_n_dev_buf.reserve(group_count);
+
+        std::vector<grouped_gemm_kargs> gemm_descs;
+        gemm_descs.reserve(group_count);
+
+        for(int i = 0; i < group_count; ++i)
+        {
+            const ck_tile::index_t M = Ms[i];
+            const ck_tile::index_t N = Ns[i];
+            const ck_tile::index_t K = Ks[i];
+
+            stride_As[i] = f_get_default_stride(M, K, stride_As[i], ALayout{});
+            stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{});
+            stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{});
+
+            a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
+                f_host_tensor_descriptor(M, K, stride_As[i], ALayout{})));
+            b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
+                f_host_tensor_descriptor(K, N, stride_Bs[i], BLayout{})));
+            c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
+                f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{})));
+
+            ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
+            ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
+
+            // Host-side preshuffle of B
+            auto b_shuffle_host = shuffle_b(b_k_n_tensors[i]);
+
+            a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                a_m_k_tensors[i].get_element_space_size_in_bytes()));
+            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                b_shuffle_host.get_element_space_size_in_bytes()));
+            c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                c_m_n_tensors[i].get_element_space_size_in_bytes()));
+
+            a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
+            b_k_n_dev_buf[i]->ToDevice(b_shuffle_host.data());
+            c_m_n_dev_buf[i]->SetZero();
+            c_m_n_tensors[i].SetZero();
+
+            const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
+            const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
+            void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
+
+            gemm_descs.push_back({p_a,
+                                  p_b,
+                                  {/*ds_ptr*/},
+                                  p_c,
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_As[i],
+                                  stride_Bs[i],
+                                  {/*stride_Ds*/},
+                                  stride_Cs[i]});
+        }
+
+        ck_tile::DeviceMem gemm_workspace;
+        gemm_workspace.Realloc(get_workspace_size(gemm_descs));
+
+        invoke_grouped_gemm<ALayout, BLayout, CLayout>(gemm_descs,
+                                                       ck_tile::stream_config{nullptr, false, 1},
+                                                       gemm_workspace.GetDeviceBuffer());
+
+        // Copy results back to host for validation
+        for(int i = 0; i < group_count; i++)
+        {
+            c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data());
+        }
+
+        bool pass{true};
+        for(int i = 0; i < group_count; ++i)
+        {
+            ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+                f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{}));
+            c_m_n_host_ref.SetZero();
+            ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+                a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
+            const float max_accumulated_value =
+                *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+            const auto rtol_atol =
+                calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                    Ks[i], kbatch, max_accumulated_value);
+            pass &= ck_tile::check_err(c_m_n_tensors[i],
+                                       c_m_n_host_ref,
+                                       "Error: Incorrect results!",
+                                       rtol_atol.at(ck_tile::number<0>{}),
+                                       rtol_atol.at(ck_tile::number<1>{}));
+        }
+        EXPECT_TRUE(pass);
+    }
+};
diff --git a/test/ck_tile/image_to_column/CMakeLists.txt b/test/ck_tile/image_to_column/CMakeLists.txt
index 247358dd4d..8873a846fc 100644
--- a/test/ck_tile/image_to_column/CMakeLists.txt
+++ b/test/ck_tile/image_to_column/CMakeLists.txt
@@ -1,4 +1,3 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_tile_image_to_column test_tile_image_to_column.cpp)
 endif()
diff --git a/test/ck_tile/layernorm2d/CMakeLists.txt b/test/ck_tile/layernorm2d/CMakeLists.txt
index c909d6cf40..e924f39e7a 100644
--- a/test/ck_tile/layernorm2d/CMakeLists.txt
+++ b/test/ck_tile/layernorm2d/CMakeLists.txt
@@ -14,7 +14,7 @@ function(create_tile_layernorm2d_fwd SUFFIX)
     target_compile_options(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
 endfunction()
 
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
     set(LAYERNORM2D_FWD_ENABLE_APIS  "fwd" CACHE STRING
         "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
diff --git a/test/ck_tile/layernorm2d/generate.py b/test/ck_tile/layernorm2d/generate.py
index c4366f6662..f7446c0148 100644
--- a/test/ck_tile/layernorm2d/generate.py
+++ b/test/ck_tile/layernorm2d/generate.py
@@ -75,54 +75,17 @@ struct layernorm2d_fwd_traits_
     using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
     using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
     using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
+    
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN           = kPadN_;
     static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
diff --git a/test/ck_tile/moe_smoothquant/CMakeLists.txt b/test/ck_tile/moe_smoothquant/CMakeLists.txt
index b6c8a395b6..019e87323f 100644
--- a/test/ck_tile/moe_smoothquant/CMakeLists.txt
+++ b/test/ck_tile/moe_smoothquant/CMakeLists.txt
@@ -1,5 +1,4 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     function (add_moe_smoothquant_test TARGET_NAME MAIN_SRC)
         message(DEBUG "adding ${TARGET_NAME}")
         add_gtest_executable(${TARGET_NAME} ${MAIN_SRC})
diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
index f2875c72c8..c6ef822f64 100644
--- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
+++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
@@ -53,7 +53,7 @@ float moe_smoothquant_(const S& s, A a)
     using Kernel = ck_tile::MoeSmoothquant<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
index ced9b4ef3d..40b09dca00 100644
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
+++ b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
@@ -38,54 +38,16 @@ struct moe_smoothquant_traits_
     using InputType  = ck_tile::remove_cvref_t<InputType_>;
     using OutputType = ck_tile::remove_cvref_t<OutputType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
+    using Shape          = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN    = kPadN_;
     static constexpr bool kTwoPass = kTwoPass_;
diff --git a/test/ck_tile/moe_sorting/CMakeLists.txt b/test/ck_tile/moe_sorting/CMakeLists.txt
index 5abc7df5a9..48d8e1392f 100644
--- a/test/ck_tile/moe_sorting/CMakeLists.txt
+++ b/test/ck_tile/moe_sorting/CMakeLists.txt
@@ -1,5 +1,5 @@
-# Currently ck_tile is only built on gfx90a, gfx942 and gfx950
-if(GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950" OR GPU_TARGETS MATCHES "gfx90a")
+# Currently ck_tile is only built on gfx90a, gfx942, gfx950, gfx11 and gfx12
+if(GPU_TARGETS MATCHES "gfx942|gfx950|gfx90a|gfx11|gfx12")
 
     function(add_moe_sorting_test EXECUTABLE USE_2D_BUF)
         add_gtest_executable(${EXECUTABLE} test_moe_sorting.cpp moe_sorting_api.cpp)
diff --git a/test/ck_tile/permute/CMakeLists.txt b/test/ck_tile/permute/CMakeLists.txt
index 4256ad8de1..8574813be3 100644
--- a/test/ck_tile/permute/CMakeLists.txt
+++ b/test/ck_tile/permute/CMakeLists.txt
@@ -1,5 +1,4 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
 
     function(add_permute_test TARGET_NAME MAIN_SRC)
         add_gtest_executable(${TARGET_NAME} ${MAIN_SRC})
diff --git a/test/ck_tile/permute/test_permute_util.hpp b/test/ck_tile/permute/test_permute_util.hpp
index 5494749541..2028f56bb8 100644
--- a/test/ck_tile/permute/test_permute_util.hpp
+++ b/test/ck_tile/permute/test_permute_util.hpp
@@ -17,9 +17,11 @@
 #include <utility>
 #include <vector>
 
+#if !CK_TILE_USE_WMMA
 #ifdef PERMUTE_USE_ALTERNATIVE_IMPL
 #include "alternative_impl/matrix_core_swizzle.hpp"
 #endif
+#endif
 
 namespace detail {
 template <int bytes>
@@ -193,6 +195,7 @@ class TestCkTilePermute : public ::testing::Test
 
             return permute<DataType>(a, stream_config);
         };
+#if !CK_TILE_USE_WMMA
 #ifdef PERMUTE_USE_ALTERNATIVE_IMPL
         // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
         if((perm == std::string("0,1,4,2,5,3,6") || perm == std::string("0,1,2,4,5,3,6") ||
@@ -278,6 +281,7 @@ class TestCkTilePermute : public ::testing::Test
             }
         }
         else
+#endif
 #endif
         {
             run_permute();
diff --git a/test/ck_tile/reduce/CMakeLists.txt b/test/ck_tile/reduce/CMakeLists.txt
index 052669e20a..0ba5974f6c 100644
--- a/test/ck_tile/reduce/CMakeLists.txt
+++ b/test/ck_tile/reduce/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp)
     if(result EQUAL 0)
         target_link_libraries(test_ck_tile_reduce2d PRIVATE utility)
diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp
index ff807e52c9..ded0406797 100644
--- a/test/ck_tile/reduce/test_reduce2d.cpp
+++ b/test/ck_tile/reduce/test_reduce2d.cpp
@@ -59,7 +59,7 @@ class TestCkTileReduce : public ::testing::Test
         using Kernel = ck_tile::Reduce<Problem>;
 
         // Launch configuration
-        constexpr ck_tile::index_t kBlockSize  = 256;
+        const ck_tile::index_t kBlockSize      = Kernel::BlockSize();
         constexpr ck_tile::index_t kBlockPerCu = 1;
 
         ck_tile::index_t kGridSize =
diff --git a/test/ck_tile/rmsnorm2d/CMakeLists.txt b/test/ck_tile/rmsnorm2d/CMakeLists.txt
index 5a73b0914c..c60d73aafd 100644
--- a/test/ck_tile/rmsnorm2d/CMakeLists.txt
+++ b/test/ck_tile/rmsnorm2d/CMakeLists.txt
@@ -14,7 +14,7 @@ function(create_tile_rmsnorm2d_fwd SUFFIX)
     target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
 endfunction()
 
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     set(RMSNORM2D_FWD_KNOWN_APIS "fwd;bwd")
     set(RMSNORM2D_FWD_ENABLE_APIS  "fwd" CACHE STRING
         "semicolon-separated list of APIs to generate (${RMSNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
diff --git a/test/ck_tile/rmsnorm2d/generate.py b/test/ck_tile/rmsnorm2d/generate.py
index 1a1c842b3c..3bcc427e83 100644
--- a/test/ck_tile/rmsnorm2d/generate.py
+++ b/test/ck_tile/rmsnorm2d/generate.py
@@ -74,54 +74,17 @@ struct rmsnorm2d_fwd_traits_
     using YScaleDataType      = ck_tile::remove_cvref_t<YScaleDataType_>;
     using UnquantYDataType    = ck_tile::remove_cvref_t<UnquantYDataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
     using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
+    
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN        = kPadN_;
     static constexpr bool kSaveInvRms  = kSaveInvRms_;
@@ -238,7 +201,7 @@ float rmsnorm2d_fwd_(const S& s, A a)
     using Kernel = ck_tile::Rmsnorm2dFwd<Pipeline, Epilogue>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
diff --git a/test/ck_tile/smoothquant/CMakeLists.txt b/test/ck_tile/smoothquant/CMakeLists.txt
index 548fc03a41..381923803f 100644
--- a/test/ck_tile/smoothquant/CMakeLists.txt
+++ b/test/ck_tile/smoothquant/CMakeLists.txt
@@ -1,5 +1,4 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     function (add_smoothquant_test TARGET_NAME MAIN_SRC)
         message(DEBUG "adding ${TARGET_NAME}")
 
diff --git a/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp b/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
index 8929289cdb..138afcffaf 100644
--- a/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
+++ b/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
@@ -49,7 +49,7 @@ float smoothquant_(const S& s, A a)
     using Kernel = ck_tile::Smoothquant<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
diff --git a/test/ck_tile/smoothquant/smoothquant.hpp b/test/ck_tile/smoothquant/smoothquant.hpp
index b1d5dae3d3..ef0c36aa98 100644
--- a/test/ck_tile/smoothquant/smoothquant.hpp
+++ b/test/ck_tile/smoothquant/smoothquant.hpp
@@ -49,54 +49,17 @@ struct smoothquant_traits_
 {
     using DataType = ck_tile::remove_cvref_t<DataType_>;
 
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
-            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
-        }
-    }();
-
     static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
     static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
 
     static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
     static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
 
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+    using BlockTile      = ck_tile::sequence<Block_M, Block_N>;
+    using Vector         = ck_tile::sequence<1, Vector_N_>;
+    using ThreadPerBlock = ck_tile::sequence<ThreadPerBlock_M_, ThreadPerBlock_N_>;
 
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, ThreadPerBlock, Vector>;
 
     static constexpr bool kPadN    = kPadN_;
     static constexpr bool kTwoPass = kTwoPass_;
diff --git a/test/ck_tile/topk_softmax/CMakeLists.txt b/test/ck_tile/topk_softmax/CMakeLists.txt
index 046eaf6649..cd524eca01 100644
--- a/test/ck_tile/topk_softmax/CMakeLists.txt
+++ b/test/ck_tile/topk_softmax/CMakeLists.txt
@@ -10,8 +10,7 @@ function(add_tile_topk_softmax_test SUFFIX)
     target_compile_options(${TEST_NAME} PRIVATE ${TEST_TOPK_SOFTMAX_COMPILE_OPTIONS})
 endfunction()
 
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_tile_topk_softmax_test(fp16)
     add_tile_topk_softmax_test(bf16)
 else()
diff --git a/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp
index 3ff23e2e11..3b1b6ffb6d 100644
--- a/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp
+++ b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp
@@ -32,13 +32,17 @@ TEST_F(PrintStaticEncodingPatternTest, PrintThreadRakedPattern)
 {
     // Test printing thread raked pattern
     using PatternType =
-        TileDistributionEncodingPattern2D<64, 8, 16, 4, tile_distribution_pattern::thread_raked>;
+        tile_distribution_encoding_pattern_2d<64,
+                                              8,
+                                              16,
+                                              4,
+                                              tile_distribution_pattern::thread_raked>;
     PatternType pattern;
 
     std::string output = CapturePrintOutput(pattern);
 
     // Verify the output contains expected information
-    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("tile_distribution_encoding_pattern_2d") != std::string::npos);
     EXPECT_TRUE(output.find("BlockSize:64") != std::string::npos);
     EXPECT_TRUE(output.find("YPerTile:8") != std::string::npos);
     EXPECT_TRUE(output.find("XPerTile:16") != std::string::npos);
@@ -52,13 +56,17 @@ TEST_F(PrintStaticEncodingPatternTest, PrintWarpRakedPattern)
 {
     // Test printing warp raked pattern
     using PatternType =
-        TileDistributionEncodingPattern2D<128, 16, 32, 8, tile_distribution_pattern::warp_raked>;
+        tile_distribution_encoding_pattern_2d<128,
+                                              16,
+                                              32,
+                                              8,
+                                              tile_distribution_pattern::warp_raked>;
     PatternType pattern;
 
     std::string output = CapturePrintOutput(pattern);
 
     // Verify the output contains expected information
-    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("tile_distribution_encoding_pattern_2d") != std::string::npos);
     EXPECT_TRUE(output.find("BlockSize:128") != std::string::npos);
     EXPECT_TRUE(output.find("YPerTile:16") != std::string::npos);
     EXPECT_TRUE(output.find("XPerTile:32") != std::string::npos);
@@ -72,13 +80,17 @@ TEST_F(PrintStaticEncodingPatternTest, PrintBlockRakedPattern)
 {
     // Test printing block raked pattern
     using PatternType =
-        TileDistributionEncodingPattern2D<256, 32, 64, 16, tile_distribution_pattern::block_raked>;
+        tile_distribution_encoding_pattern_2d<256,
+                                              32,
+                                              64,
+                                              16,
+                                              tile_distribution_pattern::block_raked>;
     PatternType pattern;
 
     std::string output = CapturePrintOutput(pattern);
 
     // Verify the output contains expected information
-    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("tile_distribution_encoding_pattern_2d") != std::string::npos);
     EXPECT_TRUE(output.find("BlockSize:256") != std::string::npos);
     EXPECT_TRUE(output.find("YPerTile:32") != std::string::npos);
     EXPECT_TRUE(output.find("XPerTile:64") != std::string::npos);
diff --git a/test/contraction/test_contraction_interface_xdl.cpp b/test/contraction/test_contraction_interface_xdl.cpp
index 58232d209c..16812ce809 100644
--- a/test/contraction/test_contraction_interface_xdl.cpp
+++ b/test/contraction/test_contraction_interface_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <stdexcept>
 #include <vector>
@@ -181,3 +181,14 @@ TEST(TestContractionSupportedArgs, DEMemoryAccess)
         wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, Strides, InvalidStrides));
     EXPECT_TRUE(wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, Strides, Strides));
 }
+
+int main(int argc, char** argv)
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        std::cout << "FP32/64 are not supported on gfx11 and gfx12." << std::endl;
+        return 0;
+    }
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/contraction/test_contraction_xdl.cpp b/test/contraction/test_contraction_xdl.cpp
index 2bfd5a6a66..3a65b57b0e 100644
--- a/test/contraction/test_contraction_xdl.cpp
+++ b/test/contraction/test_contraction_xdl.cpp
@@ -12,10 +12,11 @@
 #include "profiler/profile_contraction_impl.hpp"
 #include "profiler/profile_contraction_utils.hpp"
 
-using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
-using F64  = double;
+static ck::index_t instance_index = -1;
+using F16                         = ck::half_t;
+using BF16                        = ck::bhalf_t;
+using F32                         = float;
+using F64                         = double;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -95,7 +96,8 @@ class TestContraction : public ::testing::Test
                                                                     StridesA,
                                                                     StridesB,
                                                                     StridesC,
-                                                                    StridesD);
+                                                                    StridesD,
+                                                                    instance_index);
             EXPECT_TRUE(pass);
         }
     }
@@ -219,3 +221,18 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
     this->template Run<2>({{8, 16}, {1, 1}, {8, 16}});
     this->template Run<2>({{1, 1}, {1, 1}, {1, 1}});
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
index 5cb8731b26..8904b58d8d 100644
--- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
@@ -11,6 +11,9 @@
 
 #include "profiler/profile_conv_tensor_rearrange_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestConvTensorRearrange : public ::testing::Test
 {
@@ -25,18 +28,24 @@ class TestConvTensorRearrange : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
-                                                                            ImLayout,
-                                                                            InDataType,
-                                                                            OutDataType,
-                                                                            ConvTensorRearrangeOp>(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass        = pass && ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
+                                                                                   ImLayout,
+                                                                                   InDataType,
+                                                                                   OutDataType,
+                                                                                   ConvTensorRearrangeOp>(
                                true,  // do_verification
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -157,3 +166,19 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
     this->template Run<3, int8_t, int8_t>();
 #endif
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
index df8b77aba1..36d31d53fa 100644
--- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
@@ -188,7 +188,7 @@ TEST_F(TestConvTensorRearrangeInterface1ScalarPerVector, X1ScalarPerVector)
     is_supported = this->template Run<ColumnToImage>();
     EXPECT_TRUE(is_supported);
     // vector load C % ScalarPerVector, dilation
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {8}, {1}, {2}, {0}, {0}};
     is_supported     = this->template Run<ImageToColumn>();
     EXPECT_TRUE(is_supported);
     is_supported = this->template Run<ColumnToImage>();
@@ -234,7 +234,7 @@ TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector)
     is_supported = this->template Run<ColumnToImage>();
     EXPECT_FALSE(is_supported);
     // vector load C % ScalarPerVector, dilation
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {8}, {1}, {2}, {0}, {0}};
     is_supported     = this->template Run<ImageToColumn>();
     EXPECT_FALSE(is_supported);
     is_supported = this->template Run<ColumnToImage>();
@@ -250,13 +250,13 @@ TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector)
 TEST_F(TestConvTensorRearrangeInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
 {
     // C = 3
-    this->conv_param  = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
+    this->conv_param  = {1, 1, 1, 1, 3, {4}, {5}, {1}, {1}, {0}, {0}};
     bool is_supported = this->template Run<ImageToColumn>();
     EXPECT_FALSE(is_supported);
     is_supported = this->template Run<ColumnToImage>();
     EXPECT_FALSE(is_supported);
     // C = 4
-    this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
+    this->conv_param = {1, 1, 1, 1, 8, {4}, {5}, {1}, {1}, {0}, {0}};
     is_supported     = this->template Run<ImageToColumn>();
     EXPECT_TRUE(is_supported);
     is_supported = this->template Run<ColumnToImage>();
diff --git a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
index 9d2b6cf577..5ad4f63d30 100644
--- a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
@@ -9,7 +9,8 @@
 #include <gtest/gtest.h>
 
 #include "profiler/profile_conv_bwd_data_impl.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
 {
@@ -20,10 +21,15 @@ class TestConvndBwdData : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
     {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
             pass = ck::profiler::profile_conv_bwd_data_impl<
                 NDimSpatial,
                 ck::tuple_element_t<NDimSpatial - 1,
@@ -44,7 +50,8 @@ class TestConvndBwdData : public ::testing::Test
                           1,     // init_method integer value
                           false, // do_log
                           false, // time_kernel
-                          param);
+                          param,
+                          instance_index);
             EXPECT_TRUE(pass);
         }
     }
@@ -91,3 +98,19 @@ TYPED_TEST(TestConvndBwdData, Conv3dBwdData)
         {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/convnd_fwd/convnd_fwd_xdl.cpp b/test/convnd_fwd/convnd_fwd_xdl.cpp
index fe8798ceb8..6d507211ce 100644
--- a/test/convnd_fwd/convnd_fwd_xdl.cpp
+++ b/test/convnd_fwd/convnd_fwd_xdl.cpp
@@ -10,6 +10,8 @@
 
 #include "profiler/profile_conv_fwd_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
 {
@@ -20,10 +22,15 @@ class TestConvndFwd : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
     {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
             pass = ck::profiler::profile_conv_fwd_impl<
                 NDimSpatial,
                 ck::tuple_element_t<NDimSpatial - 1,
@@ -44,7 +51,8 @@ class TestConvndFwd : public ::testing::Test
                           1,     // init_method integer value
                           false, // do_log
                           false, // time_kernel
-                          param);
+                          param,
+                          instance_index);
             EXPECT_TRUE(pass);
         }
     }
@@ -90,3 +98,19 @@ TYPED_TEST(TestConvndFwd, Conv3dFwd)
         {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/data_type/test_bf8_fnuz.cpp b/test/data_type/test_bf8_fnuz.cpp
index 4ff796a614..f028c0da73 100644
--- a/test/data_type/test_bf8_fnuz.cpp
+++ b/test/data_type/test_bf8_fnuz.cpp
@@ -43,9 +43,8 @@ TEST(BF8FNUZ, ConvertFP32Nearest)
                 type_convert<float>(f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to bf8_fnuz_t and check if it is qNan
-    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
-                f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+              f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::infinity()));
     // positive norm float value to bf8 and back, check if holds
     float pos_float = 0.0000762939f;
     ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(pos_float)), abs_tol);
@@ -80,9 +79,8 @@ TEST(BF8FNUZ, ConvertFP32Stochastic)
                 type_convert<float>(f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to bf8_fnuz_t and check if it is qNan
-    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
-                f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+              f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::infinity()));
     // positive norm float value to bf8 and back, check if holds
     float pos_float = 0.0000762939f;
     ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(pos_float)), abs_tol);
@@ -118,9 +116,8 @@ TEST(BF8FNUZ, ConvertFP16Nearest)
                 type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
-    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
-                f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+              f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()));
     // positive norm fp16 value to bf8 and back, check if holds
     half_t pos_half = half_t{0.0000762939};
     ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(pos_half)), abs_tol);
@@ -155,9 +152,8 @@ TEST(BF8FNUZ, ConvertFP16Stochastic)
                 type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
-    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
-                f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+              f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()));
     // positive norm fp16 value to bf8 and back, check if holds
     half_t pos_half = half_t{0.0000762939};
     ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(pos_half)), abs_tol);
diff --git a/test/data_type/test_fp8_fnuz.cpp b/test/data_type/test_fp8_fnuz.cpp
index c2ec6dad94..0cf775f947 100644
--- a/test/data_type/test_fp8_fnuz.cpp
+++ b/test/data_type/test_fp8_fnuz.cpp
@@ -48,9 +48,8 @@ TEST(FP8FNUZ, ConvertFP32Nearest)
                 type_convert<float>(f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to f8_fnuz_t and check if it is qNan
-    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
-                f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::infinity()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+              f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::infinity()));
     // positive norm float value to fp8 and back, check if holds
     float pos_float = 0.017578125f;
     ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(pos_float)), abs_tol);
@@ -85,9 +84,8 @@ TEST(FP8FNUZ, ConvertFP32Stochastic)
                 type_convert<float>(f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to f8_fnuz_t and check if it is qNan
-    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
-                f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::infinity()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+              f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::infinity()));
     // positive norm float value to fp8 and back, check if holds
     float pos_float = 0.017578125f;
     ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(pos_float)), abs_tol);
@@ -122,9 +120,8 @@ TEST(FP8FNUZ, ConvertFP16Nearest)
                 type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN
-    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
-                f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+              f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()));
     // positive norm fp16 value to fp8 and back, check if holds
     half_t pos_half = half_t{0.017578125};
     ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(pos_half)), abs_tol);
@@ -159,9 +156,8 @@ TEST(FP8FNUZ, ConvertFP16Stochastic)
                 type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN
-    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
-                f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
-                abs_tol);
+    ASSERT_EQ(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+              f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()));
     // positive norm fp16 value to fp8 and back, check if holds
     half_t pos_half = half_t{0.017578125};
     ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(pos_half)), abs_tol);
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index d5ce77dc2b..43192ed139 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestElementwiseLayernorm : public ::testing::Test
 {
@@ -25,15 +28,20 @@ class TestElementwiseLayernorm : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}, {4096, 8192}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& length = lengths[i];
             bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
                                                                             BDataType,
                                                                             GammaDataType,
                                                                             BetaDataType,
                                                                             AccDataType,
                                                                             YDataType>(
-                true, 2, false, false, length);
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -45,3 +53,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
 TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp
index cde5c45aea..d06735a097 100644
--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp
index cad250c6fb..185412ab65 100644
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index c35aa77ea7..cf2d0bd01d 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp
index e67c8ba4f3..7bf89d9c20 100644
--- a/test/gemm/gemm_fp64.cpp
+++ b/test/gemm/gemm_fp64.cpp
@@ -31,4 +31,4 @@ using AccDataType = double;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index 6ece05e306..f1a19dd61a 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -31,4 +31,4 @@ using AccDataType = int32_t;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp
index 201a49dcd3..bee2d1ec80 100644
--- a/test/gemm/gemm_standalone_xdl_fp16.cpp
+++ b/test/gemm/gemm_standalone_xdl_fp16.cpp
@@ -105,6 +105,7 @@ int main(int argc, char* argv[])
 
     bool do_verification = true;
     bool time_kernel     = true;
+    int problem_index    = -1;
 
     if(argc == 1)
     {
@@ -115,16 +116,28 @@ int main(int argc, char* argv[])
         do_verification = std::stoi(argv[1]);
         time_kernel     = std::stoi(argv[2]);
     }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        problem_index   = std::stoi(argv[3]);
+    }
     else
     {
         std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
-                  << "arg2: time kernel (0=no, 1=yes)" << std::endl;
+                  << "arg2: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg3: problem index (0-35, -1 means all)" << std::endl;
         return 0;
     }
 
     bool pass = true;
-    for(auto& p : problems)
+    for(size_t i = 0; i < problems.size(); i++)
     {
+        if(problem_index != -1 && problem_index != static_cast<ck::index_t>(i))
+        {
+            continue;
+        }
+        auto& p                           = problems[i];
         GemmParams& problem_size          = std::get<0>(p);
         const LayoutConfig& layout_config = std::get<1>(p);
         const auto& factory               = std::get<2>(p);
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 6c46f4ee89..043eca0e83 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -261,6 +261,44 @@ struct TestGemm
             return true;
         }
     }
+
+    template <template <class...> class DeviceGemmPtr_,
+              typename ALayout,
+              typename BLayout,
+              typename CLayout,
+              typename ADataType,
+              typename BDataType,
+              typename CDataType,
+              typename AElementwiseOperation,
+              typename BElementwiseOperation,
+              typename CElementwiseOperation>
+    bool IsSupportedArgument(DeviceGemmPtr_<ALayout,
+                                            BLayout,
+                                            CLayout,
+                                            ADataType,
+                                            BDataType,
+                                            CDataType,
+                                            AElementwiseOperation,
+                                            BElementwiseOperation,
+                                            CElementwiseOperation>* gemmPtr,
+                             const GemmParams& params = GemmParams{})
+    {
+        auto invoker_ptr  = gemmPtr->MakeInvokerPointer();
+        auto argument_ptr = gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(nullptr),
+                                                         static_cast<BDataType*>(nullptr),
+                                                         static_cast<CDataType*>(nullptr),
+                                                         params.M,
+                                                         params.N,
+                                                         params.K,
+                                                         params.StrideA,
+                                                         params.StrideB,
+                                                         params.StrideC,
+                                                         AElementwiseOperation{},
+                                                         BElementwiseOperation{},
+                                                         CElementwiseOperation{});
+
+        return gemmPtr->IsSupportedArgument(argument_ptr.get());
+    }
 };
 
 } // namespace gemm_util
diff --git a/test/gemm/run_gemm_test.inc b/test/gemm/run_gemm_test.inc
index d208bb5a7b..0ab2a63367 100644
--- a/test/gemm/run_gemm_test.inc
+++ b/test/gemm/run_gemm_test.inc
@@ -1,13 +1,39 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-int run_gemm_test()
+int run_gemm_test(int argc, char* argv[])
 {
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
 
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    ck::gemm_util::GemmParams params;
+    ck::index_t instance_index = -1;
+    if(argc == 1)
+    {
+        // use default params
+    }
+    else if(argc == 4 || argc == 5)
+    {
+        params.M       = atoi(argv[1]);
+        params.N       = atoi(argv[2]);
+        params.K       = atoi(argv[3]);
+        params.StrideA = params.M;
+        params.StrideB = params.N;
+        params.StrideC = params.K;
 
+        if(argc == 5)
+        {
+            instance_index = atoi(argv[4]);
+        }
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1-4: M N K instance_index(-1 means all)" << std::endl;
+    }
+    std::cout << "Params (M, N, K, index) " << params.M << " " << params.N << " " << params.K << " "
+              << instance_index << std::endl;
     auto test = [&](auto a_layout, auto b_layout, auto c_layout) {
         bool pass = true;
 
@@ -24,10 +50,31 @@ int run_gemm_test()
         const auto gemmPtrs =
             ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
                 DeviceOp>::GetInstances();
-
+        ck::index_t num_instance = 0;
         for(auto& gemmPtr : gemmPtrs)
         {
-            pass &= ck::gemm_util::TestGemm<AccDataType>{}(gemmPtr.get());
+            if(instance_index == -1)
+            {
+                pass &= ck::gemm_util::TestGemm<AccDataType>{}(gemmPtr.get(), params);
+            }
+            else
+            {
+                auto test_gemm = ck::gemm_util::TestGemm<AccDataType>{};
+                if(test_gemm.IsSupportedArgument(gemmPtr.get(), params))
+                {
+                    if(num_instance == instance_index)
+                    {
+                        pass &= test_gemm(gemmPtr.get(), params);
+                    }
+                    num_instance++;
+                }
+            }
+        }
+
+        if(instance_index != -1)
+        {
+            std::cout << "TestGemm_instance (" << instance_index << "/" << num_instance
+                      << "): " << (pass ? "Passed" : "Failed") << std::endl;
         }
 
         return pass;
diff --git a/test/gemm_add/CMakeLists.txt b/test/gemm_add/CMakeLists.txt
index ab4c781847..fe0a08c0c9 100644
--- a/test/gemm_add/CMakeLists.txt
+++ b/test/gemm_add/CMakeLists.txt
@@ -1,19 +1,71 @@
-add_gtest_executable(test_gemm_add test_gemm_add_xdl.hpp)
+# Implements test instances for MultipleD with xdl and wmma support.
+
+add_gtest_executable(test_gemm_add_xdl test_gemm_add_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_gemm_add PRIVATE utility device_gemm_add_instance)
+    target_link_libraries(test_gemm_add_xdl PRIVATE utility device_gemm_add_instance)
 endif()
 
-add_gtest_executable(test_gemm_add_relu test_gemm_add_relu_xdl.cpp)
+add_gtest_executable(test_gemm_add_relu_xdl test_gemm_add_relu_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_relu PRIVATE utility device_gemm_add_instance device_gemm_add_relu_instance)
+    target_link_libraries(test_gemm_add_relu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_relu_instance)
 endif()
 
-add_gtest_executable(test_gemm_add_silu test_gemm_add_silu_xdl.cpp)
+add_gtest_executable(test_gemm_add_silu_xdl test_gemm_add_silu_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_silu PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
+    target_link_libraries(test_gemm_add_silu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
 endif()
 
-add_gtest_executable(test_gemm_add_fastgelu test_gemm_add_fastgelu_xdl.cpp)
+add_gtest_executable(test_gemm_add_silu_wmma test_gemm_add_silu_wmma.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_fastgelu PRIVATE utility device_gemm_add_instance device_gemm_add_fastgelu_instance)
+    target_link_libraries(test_gemm_add_silu_wmma PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
 endif()
+
+add_gtest_executable(test_gemm_add_fastgelu_xdl test_gemm_add_fastgelu_xdl.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_fastgelu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_fastgelu_instance)
+endif()
+
+add_gtest_executable(test_gemm_fastgelu_wmma test_gemm_fastgelu_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_fastgelu_wmma PRIVATE utility device_gemm_fastgelu_instance)
+endif()
+
+add_gtest_executable(test_gemm_add_fastgelu_wmma test_gemm_add_fastgelu_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_fastgelu_wmma PRIVATE utility device_gemm_add_fastgelu_instance)
+endif()
+
+add_gtest_executable(test_gemm_add_wmma test_gemm_add_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_wmma PRIVATE utility device_gemm_add_instance)
+endif()
+
+add_gtest_executable(test_gemm_add_add_fastgelu_wmma test_gemm_add_add_fastgelu_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_add_fastgelu_wmma PRIVATE utility device_gemm_add_add_fastgelu_instance)
+endif()
+
+add_gtest_executable(test_gemm_multiply_multiply_wmma test_gemm_multiply_multiply_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_multiply_multiply_wmma PRIVATE utility device_gemm_multiply_multiply_instance)
+endif()
+
+add_gtest_executable(test_gemm_add_multiply_wmma test_gemm_add_multiply_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_multiply_wmma PRIVATE utility device_gemm_add_multiply_instance)
+endif()
+
+add_gtest_executable(test_gemm_multiply_add_wmma test_gemm_multiply_add_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_multiply_add_wmma PRIVATE utility device_gemm_multiply_add_instance)
+endif()
+
+add_gtest_executable(test_gemm_bilinear_wmma test_gemm_bilinear_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_bilinear_wmma PRIVATE utility device_gemm_bilinear_instance)
+endif()
+
+add_gtest_executable(test_gemm_add_relu_wmma test_gemm_add_relu_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_add_relu_wmma PRIVATE utility device_gemm_add_relu_instance)
+endif()
\ No newline at end of file
diff --git a/test/gemm_add/test_gemm_add_add_fastgelu_wmma.cpp b/test/gemm_add/test_gemm_add_add_fastgelu_wmma.cpp
new file mode 100644
index 0000000000..25da138a04
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_add_fastgelu_wmma.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_add_fastgelu_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAddAddFastgelu : public TestGemmD0D1Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0D1Common<Tuple>::ProfileCall;
+
+    public:
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_add_fastgelu_impl<
+            typename TestGemmD0D1Common<Tuple>::ADataType,
+            typename TestGemmD0D1Common<Tuple>::BDataType,
+            typename TestGemmD0D1Common<Tuple>::AccDataType,
+            typename TestGemmD0D1Common<Tuple>::D0DataType,
+            typename TestGemmD0D1Common<Tuple>::D1DataType,
+            typename TestGemmD0D1Common<Tuple>::EDataType,
+            typename TestGemmD0D1Common<Tuple>::ALayout,
+            typename TestGemmD0D1Common<Tuple>::BLayout,
+            typename TestGemmD0D1Common<Tuple>::D0Layout,
+            typename TestGemmD0D1Common<Tuple>::D1Layout,
+            typename TestGemmD0D1Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes =
+    ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F16, Row, Row, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Row, Col, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Col, Row, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Col, Col, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAddAddFastgelu, KernelTypes);
+TYPED_TEST(TestGemmAddAddFastgelu, Test_FP16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp b/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
new file mode 100644
index 0000000000..df70a0cc99
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_fastgelu_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAddFastgelu : public TestGemmD0Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_fastgelu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Col, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Col, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAddFastgelu, KernelTypes);
+TYPED_TEST(TestGemmAddFastgelu, Test_FP16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp b/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
index 1b12ab7528..0e034f46b5 100644
--- a/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
+++ b/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
@@ -1,37 +1,29 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
 #include "profiler/profile_gemm_add_fastgelu_impl.hpp"
-#include "test_gemm_add_xdl.hpp"
+#include "test_gemm_common.hpp"
 
 template <typename Tuple>
-class TestGemmAddFastgelu : public TestGemmAdd<Tuple>
+class TestGemmAddFastgelu : public TestGemmD0Common<Tuple>
 {
-    private:
-    using ADataType   = std::tuple_element_t<0, Tuple>;
-    using BDataType   = std::tuple_element_t<1, Tuple>;
-    using AccDataType = std::tuple_element_t<2, Tuple>;
-    using D0DataType  = std::tuple_element_t<3, Tuple>;
-    using EDataType   = std::tuple_element_t<4, Tuple>;
-    using ALayout     = std::tuple_element_t<5, Tuple>;
-    using BLayout     = std::tuple_element_t<6, Tuple>;
-    using D0Layout    = std::tuple_element_t<7, Tuple>;
-    using ELayout     = std::tuple_element_t<8, Tuple>;
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
 
-    constexpr static auto ProfileGemmAddFastgeluImpl =
-        ck::profiler::profile_gemm_add_fastgelu_impl<ADataType,
-                                                     BDataType,
-                                                     AccDataType,
-                                                     D0DataType,
-                                                     EDataType,
-                                                     ALayout,
-                                                     BLayout,
-                                                     D0Layout,
-                                                     ELayout>;
-
-    decltype(ProfileGemmAddFastgeluImpl) GetImpl() override { return ProfileGemmAddFastgeluImpl; }
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_fastgelu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
diff --git a/test/gemm_add/test_gemm_add_multiply_wmma.cpp b/test/gemm_add/test_gemm_add_multiply_wmma.cpp
new file mode 100644
index 0000000000..be4a99d69f
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_multiply_wmma.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "test_gemm_common.hpp"
+#include "profiler/profile_gemm_add_multiply_impl.hpp"
+
+template <typename Tuple>
+class TestGemmAddMultiply : public TestGemmD0D1Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0D1Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_multiply_impl<
+            typename TestGemmD0D1Common<Tuple>::ADataType,
+            typename TestGemmD0D1Common<Tuple>::BDataType,
+            typename TestGemmD0D1Common<Tuple>::AccDataType,
+            typename TestGemmD0D1Common<Tuple>::D0DataType,
+            typename TestGemmD0D1Common<Tuple>::D1DataType,
+            typename TestGemmD0D1Common<Tuple>::EDataType,
+            typename TestGemmD0D1Common<Tuple>::ALayout,
+            typename TestGemmD0D1Common<Tuple>::BLayout,
+            typename TestGemmD0D1Common<Tuple>::D0Layout,
+            typename TestGemmD0D1Common<Tuple>::D1Layout,
+            typename TestGemmD0D1Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes =
+    ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F16, Row, Col, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Row, Row, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Col, Col, Row, Row, Row>,
+                     std::tuple<F16, F16, F32, F16, F16, F16, Col, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAddMultiply, KernelTypes);
+// Due to F16 shuffle data type tests has to run with limited K size. Change instances to FP32?
+TYPED_TEST(TestGemmAddMultiply, Test) { this->Run({{16, 32, 64}, {2048, 1024, 256}}); }
diff --git a/test/gemm_add/test_gemm_add_relu_wmma.cpp b/test/gemm_add/test_gemm_add_relu_wmma.cpp
new file mode 100644
index 0000000000..76c66a11b1
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_relu_wmma.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_relu_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAddRelu : public TestGemmD0Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_relu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAddRelu, KernelTypes);
+TYPED_TEST(TestGemmAddRelu, Test_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_relu_xdl.cpp b/test/gemm_add/test_gemm_add_relu_xdl.cpp
index e8b769b1cb..4b445e8e41 100644
--- a/test/gemm_add/test_gemm_add_relu_xdl.cpp
+++ b/test/gemm_add/test_gemm_add_relu_xdl.cpp
@@ -1,37 +1,29 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
 #include "profiler/profile_gemm_add_relu_impl.hpp"
-#include "test_gemm_add_xdl.hpp"
+#include "test_gemm_common.hpp"
 
 template <typename Tuple>
-class TestGemmAddRelu : public TestGemmAdd<Tuple>
+class TestGemmAddRelu : public TestGemmD0Common<Tuple>
 {
-    private:
-    using ADataType   = std::tuple_element_t<0, Tuple>;
-    using BDataType   = std::tuple_element_t<1, Tuple>;
-    using AccDataType = std::tuple_element_t<2, Tuple>;
-    using D0DataType  = std::tuple_element_t<3, Tuple>;
-    using EDataType   = std::tuple_element_t<4, Tuple>;
-    using ALayout     = std::tuple_element_t<5, Tuple>;
-    using BLayout     = std::tuple_element_t<6, Tuple>;
-    using D0Layout    = std::tuple_element_t<7, Tuple>;
-    using ELayout     = std::tuple_element_t<8, Tuple>;
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
 
-    constexpr static auto ProfileGemmAddReluImpl =
-        ck::profiler::profile_gemm_add_relu_impl<ADataType,
-                                                 BDataType,
-                                                 AccDataType,
-                                                 D0DataType,
-                                                 EDataType,
-                                                 ALayout,
-                                                 BLayout,
-                                                 D0Layout,
-                                                 ELayout>;
-
-    decltype(ProfileGemmAddReluImpl) GetImpl() override { return ProfileGemmAddReluImpl; }
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_relu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
diff --git a/test/gemm_add/test_gemm_add_silu_wmma.cpp b/test/gemm_add/test_gemm_add_silu_wmma.cpp
new file mode 100644
index 0000000000..7afa68dfe9
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_silu_wmma.cpp
@@ -0,0 +1,34 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_silu_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAddSilu : public TestGemmD0Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_silu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAddSilu, KernelTypes);
+TYPED_TEST(TestGemmAddSilu, Test_BF16FP16_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_silu_xdl.cpp b/test/gemm_add/test_gemm_add_silu_xdl.cpp
index 75fa59a8e7..6bd0ee422d 100644
--- a/test/gemm_add/test_gemm_add_silu_xdl.cpp
+++ b/test/gemm_add/test_gemm_add_silu_xdl.cpp
@@ -1,37 +1,29 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
 #include "profiler/profile_gemm_add_silu_impl.hpp"
-#include "test_gemm_add_xdl.hpp"
+#include "test_gemm_common.hpp"
 
 template <typename Tuple>
-class TestGemmAddSilu : public TestGemmAdd<Tuple>
+class TestGemmAddSilu : public TestGemmD0Common<Tuple>
 {
-    private:
-    using ADataType   = std::tuple_element_t<0, Tuple>;
-    using BDataType   = std::tuple_element_t<1, Tuple>;
-    using AccDataType = std::tuple_element_t<2, Tuple>;
-    using D0DataType  = std::tuple_element_t<3, Tuple>;
-    using EDataType   = std::tuple_element_t<4, Tuple>;
-    using ALayout     = std::tuple_element_t<5, Tuple>;
-    using BLayout     = std::tuple_element_t<6, Tuple>;
-    using D0Layout    = std::tuple_element_t<7, Tuple>;
-    using ELayout     = std::tuple_element_t<8, Tuple>;
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
 
-    constexpr static auto ProfileGemmAddSiluImpl =
-        ck::profiler::profile_gemm_add_silu_impl<ADataType,
-                                                 BDataType,
-                                                 AccDataType,
-                                                 D0DataType,
-                                                 EDataType,
-                                                 ALayout,
-                                                 BLayout,
-                                                 D0Layout,
-                                                 ELayout>;
-
-    decltype(ProfileGemmAddSiluImpl) GetImpl() override { return ProfileGemmAddSiluImpl; }
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_silu_impl<
+            typename TestGemmD0Common<Tuple>::ADataType,
+            typename TestGemmD0Common<Tuple>::BDataType,
+            typename TestGemmD0Common<Tuple>::AccDataType,
+            typename TestGemmD0Common<Tuple>::D0DataType,
+            typename TestGemmD0Common<Tuple>::EDataType,
+            typename TestGemmD0Common<Tuple>::ALayout,
+            typename TestGemmD0Common<Tuple>::BLayout,
+            typename TestGemmD0Common<Tuple>::D0Layout,
+            typename TestGemmD0Common<Tuple>::ELayout>;
+    }
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
diff --git a/test/gemm_add/test_gemm_add_wmma.cpp b/test/gemm_add/test_gemm_add_wmma.cpp
new file mode 100644
index 0000000000..ae08d50fcc
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_wmma.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAdd : public TestGemmD0Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_impl<typename TestGemmD0Common<Tuple>::ADataType,
+                                                   typename TestGemmD0Common<Tuple>::BDataType,
+                                                   typename TestGemmD0Common<Tuple>::AccDataType,
+                                                   typename TestGemmD0Common<Tuple>::D0DataType,
+                                                   typename TestGemmD0Common<Tuple>::EDataType,
+                                                   typename TestGemmD0Common<Tuple>::ALayout,
+                                                   typename TestGemmD0Common<Tuple>::BLayout,
+                                                   typename TestGemmD0Common<Tuple>::D0Layout,
+                                                   typename TestGemmD0Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAdd, KernelTypes);
+TYPED_TEST(TestGemmAdd, Test_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_xdl.cpp b/test/gemm_add/test_gemm_add_xdl.cpp
new file mode 100644
index 0000000000..6696c1ccf6
--- /dev/null
+++ b/test/gemm_add/test_gemm_add_xdl.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_add_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmAdd : public TestGemmD0Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_add_impl<typename TestGemmD0Common<Tuple>::ADataType,
+                                                   typename TestGemmD0Common<Tuple>::BDataType,
+                                                   typename TestGemmD0Common<Tuple>::AccDataType,
+                                                   typename TestGemmD0Common<Tuple>::D0DataType,
+                                                   typename TestGemmD0Common<Tuple>::EDataType,
+                                                   typename TestGemmD0Common<Tuple>::ALayout,
+                                                   typename TestGemmD0Common<Tuple>::BLayout,
+                                                   typename TestGemmD0Common<Tuple>::D0Layout,
+                                                   typename TestGemmD0Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmAdd, KernelTypes);
+TYPED_TEST(TestGemmAdd, Test_BF16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_xdl.hpp b/test/gemm_add/test_gemm_add_xdl.hpp
deleted file mode 100644
index 11d3d1c10a..0000000000
--- a/test/gemm_add/test_gemm_add_xdl.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_add_impl.hpp"
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-using I8   = int8_t;
-using BF16 = ck::bhalf_t;
-using F16  = ck::half_t;
-using F32  = float;
-
-template <typename Tuple>
-class TestGemmAdd : public ::testing::Test
-{
-    protected:
-    using ADataType   = std::tuple_element_t<0, Tuple>;
-    using BDataType   = std::tuple_element_t<1, Tuple>;
-    using AccDataType = std::tuple_element_t<2, Tuple>;
-    using D0DataType  = std::tuple_element_t<3, Tuple>;
-    using EDataType   = std::tuple_element_t<4, Tuple>;
-    using ALayout     = std::tuple_element_t<5, Tuple>;
-    using BLayout     = std::tuple_element_t<6, Tuple>;
-    using D0Layout    = std::tuple_element_t<7, Tuple>;
-    using ELayout     = std::tuple_element_t<8, Tuple>;
-
-    constexpr static auto ProfileGemmAddImpl = ck::profiler::profile_gemm_add_impl<ADataType,
-                                                                                   BDataType,
-                                                                                   AccDataType,
-                                                                                   D0DataType,
-                                                                                   EDataType,
-                                                                                   ALayout,
-                                                                                   BLayout,
-                                                                                   D0Layout,
-                                                                                   ELayout>;
-
-    virtual decltype(ProfileGemmAddImpl) GetImpl() { return ProfileGemmAddImpl; }
-
-    void Run()
-    {
-        std::vector<std::vector<ck::index_t>> lengths = {
-            {16, 32, 64}, {2048, 4096, 8192}, {2048, 1024, 16}};
-
-        bool all_success = true;
-
-        for(auto length : lengths)
-        {
-            int M        = length[0];
-            int N        = length[1];
-            int K        = length[2];
-            int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
-            int StrideB  = ck::is_same_v<BLayout, Row> ? N : K;
-            int StrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
-            int StrideE  = ck::is_same_v<ELayout, Row> ? N : M;
-
-            all_success =
-                all_success &
-                GetImpl()(true, 1, false, false, M, N, K, StrideA, StrideB, StrideD0, StrideE);
-        }
-
-        EXPECT_TRUE(all_success);
-    }
-};
-
-using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
-
-TYPED_TEST_SUITE(TestGemmAdd, KernelTypes);
-TYPED_TEST(TestGemmAdd, Test_BF16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_bilinear_wmma.cpp b/test/gemm_add/test_gemm_bilinear_wmma.cpp
new file mode 100644
index 0000000000..dfa8ac7121
--- /dev/null
+++ b/test/gemm_add/test_gemm_bilinear_wmma.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_bilinear_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmBilinear : public ::testing::Test
+{
+    private:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using D0DataType  = std::tuple_element_t<3, Tuple>;
+    using EDataType   = std::tuple_element_t<4, Tuple>;
+    using ALayout     = std::tuple_element_t<5, Tuple>;
+    using BLayout     = std::tuple_element_t<6, Tuple>;
+    using D0Layout    = std::tuple_element_t<7, Tuple>;
+    using ELayout     = std::tuple_element_t<8, Tuple>;
+
+    constexpr static auto ProfileGemmBilinearImpl =
+        ck::profiler::profile_gemm_bilinear_impl<ADataType,
+                                                 BDataType,
+                                                 AccDataType,
+                                                 D0DataType,
+                                                 EDataType,
+                                                 ALayout,
+                                                 BLayout,
+                                                 D0Layout,
+                                                 ELayout>;
+
+    public:
+    void Run(TestMatrixSizes const& lengths)
+    {
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M        = length[0];
+            int N        = length[1];
+            int K        = length[2];
+            int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB  = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
+            int StrideE  = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success =
+                all_success &
+                ProfileGemmBilinearImpl(
+                    1, 1, false, true, M, N, K, StrideA, StrideB, StrideD0, StrideE, 1.F, 1.F);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Col, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Col, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, I8, Row, Row, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, I8, Row, Col, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, I8, Col, Row, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, I8, Col, Col, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmBilinear, KernelTypes);
+TYPED_TEST(TestGemmBilinear, Test) { this->Run(DefaultTestMatrixSizes); }
diff --git a/test/gemm_add/test_gemm_common.hpp b/test/gemm_add/test_gemm_common.hpp
new file mode 100644
index 0000000000..9ab6c335e9
--- /dev/null
+++ b/test/gemm_add/test_gemm_common.hpp
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/data_type.hpp"
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using I8   = int8_t;
+using I32  = int32_t;
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+using F8   = ck::f8_t;
+
+// M, N, K
+using TestMatrixSizes = std::vector<std::vector<ck::index_t>>;
+
+static const TestMatrixSizes DefaultTestMatrixSizes = {
+    {16, 32, 64}, {512, 2048, 4096}, {2048, 1024, 16}};
+
+template <typename Tuple>
+class TestGemmCommon : public ::testing::Test
+{
+    protected:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using EDataType   = std::tuple_element_t<3, Tuple>;
+    using ALayout     = std::tuple_element_t<4, Tuple>;
+    using BLayout     = std::tuple_element_t<5, Tuple>;
+    using ELayout     = std::tuple_element_t<6, Tuple>;
+
+    using ProfileCall = bool (*const)(int, int, bool, bool, int, int, int, int, int, int);
+
+    virtual ProfileCall GetImpl() = 0;
+
+    void Run(const TestMatrixSizes& lengths = DefaultTestMatrixSizes)
+    {
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M       = length[0];
+            int N       = length[1];
+            int K       = length[2];
+            int StrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideE = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success =
+                all_success & GetImpl()(1, 1, false, true, M, N, K, StrideA, StrideB, StrideE);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+template <typename Tuple>
+class TestGemmD0Common : public ::testing::Test
+{
+    protected:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using D0DataType  = std::tuple_element_t<3, Tuple>;
+    using EDataType   = std::tuple_element_t<4, Tuple>;
+    using ALayout     = std::tuple_element_t<5, Tuple>;
+    using BLayout     = std::tuple_element_t<6, Tuple>;
+    using D0Layout    = std::tuple_element_t<7, Tuple>;
+    using ELayout     = std::tuple_element_t<8, Tuple>;
+
+    using ProfileCall = bool (*const)(int, int, bool, bool, int, int, int, int, int, int, int);
+
+    virtual ProfileCall GetImpl() = 0;
+
+    void Run(const TestMatrixSizes& lengths = DefaultTestMatrixSizes)
+    {
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M        = length[0];
+            int N        = length[1];
+            int K        = length[2];
+            int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB  = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
+            int StrideE  = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success =
+                all_success &
+                GetImpl()(1, 1, false, true, M, N, K, StrideA, StrideB, StrideD0, StrideE);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+template <typename Tuple>
+class TestGemmD0D1Common : public ::testing::Test
+{
+    protected:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using D0DataType  = std::tuple_element_t<3, Tuple>;
+    using D1DataType  = std::tuple_element_t<4, Tuple>;
+    using EDataType   = std::tuple_element_t<5, Tuple>;
+    using ALayout     = std::tuple_element_t<6, Tuple>;
+    using BLayout     = std::tuple_element_t<7, Tuple>;
+    using D0Layout    = std::tuple_element_t<8, Tuple>;
+    using D1Layout    = std::tuple_element_t<9, Tuple>;
+    using ELayout     = std::tuple_element_t<10, Tuple>;
+
+    using ProfileCall = bool (*const)(int, int, bool, bool, int, int, int, int, int, int, int, int);
+
+    virtual ProfileCall GetImpl() = 0;
+
+    void Run(const TestMatrixSizes& lengths = DefaultTestMatrixSizes)
+    {
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M        = length[0];
+            int N        = length[1];
+            int K        = length[2];
+            int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB  = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
+            int StrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
+            int StrideE  = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success =
+                all_success &
+                GetImpl()(
+                    1, 1, false, true, M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
diff --git a/test/gemm_add/test_gemm_fastgelu_wmma.cpp b/test/gemm_add/test_gemm_fastgelu_wmma.cpp
new file mode 100644
index 0000000000..d8dd218ec6
--- /dev/null
+++ b/test/gemm_add/test_gemm_fastgelu_wmma.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_fastgelu_impl.hpp"
+#include "test_gemm_common.hpp"
+
+template <typename Tuple>
+class TestGemmFastgelu : public TestGemmCommon<Tuple>
+{
+    using ProfileCall = typename TestGemmCommon<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_fastgelu_impl<typename TestGemmCommon<Tuple>::ADataType,
+                                                        typename TestGemmCommon<Tuple>::BDataType,
+                                                        typename TestGemmCommon<Tuple>::AccDataType,
+                                                        typename TestGemmCommon<Tuple>::EDataType,
+                                                        typename TestGemmCommon<Tuple>::ALayout,
+                                                        typename TestGemmCommon<Tuple>::BLayout,
+                                                        typename TestGemmCommon<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, Row, Col, Row>,
+                                     std::tuple<F16, F16, F32, F16, Col, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, Col, Col, Row>>;
+
+TYPED_TEST_SUITE(TestGemmFastgelu, KernelTypes);
+TYPED_TEST(TestGemmFastgelu, Test_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_multiply_add_wmma.cpp b/test/gemm_add/test_gemm_multiply_add_wmma.cpp
new file mode 100644
index 0000000000..2531464f72
--- /dev/null
+++ b/test/gemm_add/test_gemm_multiply_add_wmma.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "test_gemm_common.hpp"
+#include "profiler/profile_gemm_multiply_add_impl.hpp"
+
+template <typename Tuple>
+class TestGemmMultiplyAdd : public TestGemmD0D1Common<Tuple>
+{
+    using ProfileCall = typename TestGemmD0D1Common<Tuple>::ProfileCall;
+
+    ProfileCall GetImpl() override
+    {
+        return ck::profiler::profile_gemm_multiply_add_impl<
+            typename TestGemmD0D1Common<Tuple>::ADataType,
+            typename TestGemmD0D1Common<Tuple>::BDataType,
+            typename TestGemmD0D1Common<Tuple>::AccDataType,
+            typename TestGemmD0D1Common<Tuple>::D0DataType,
+            typename TestGemmD0D1Common<Tuple>::D1DataType,
+            typename TestGemmD0D1Common<Tuple>::EDataType,
+            typename TestGemmD0D1Common<Tuple>::ALayout,
+            typename TestGemmD0D1Common<Tuple>::BLayout,
+            typename TestGemmD0D1Common<Tuple>::D0Layout,
+            typename TestGemmD0D1Common<Tuple>::D1Layout,
+            typename TestGemmD0D1Common<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<
+#ifdef CK_USE_WMMA_FP8
+    std::tuple<F16, F8, F32, F32, F32, F16, Row, Col, Row, Row, Row>,
+    std::tuple<F16, F8, F32, F32, F32, F16, Row, Row, Row, Row, Row>,
+#endif
+    std::tuple<F16, F16, F32, F16, F16, F16, Row, Col, Row, Row, Row>,
+    std::tuple<F16, F16, F32, F16, F16, F16, Row, Row, Row, Row, Row>>;
+
+TYPED_TEST_SUITE(TestGemmMultiplyAdd, KernelTypes);
+// Due to F16 shuffle data type tests has to run with limited K size. Change instances to FP32?
+TYPED_TEST(TestGemmMultiplyAdd, Test) { this->Run({{16, 32, 64}, {2048, 1024, 256}}); }
diff --git a/test/gemm_add/test_gemm_multiply_multiply_wmma.cpp b/test/gemm_add/test_gemm_multiply_multiply_wmma.cpp
new file mode 100644
index 0000000000..74e900c43f
--- /dev/null
+++ b/test/gemm_add/test_gemm_multiply_multiply_wmma.cpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_multiply_multiply_impl.hpp"
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using I8   = int8_t;
+using I32  = int32_t;
+using F8   = ck::f8_t;
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <typename Tuple>
+class TestGemmMultiplyMultiply : public ::testing::Test
+{
+    private:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using D0DataType  = std::tuple_element_t<3, Tuple>;
+    using D1DataType  = std::tuple_element_t<4, Tuple>;
+    using EDataType   = std::tuple_element_t<5, Tuple>;
+    using ALayout     = std::tuple_element_t<6, Tuple>;
+    using BLayout     = std::tuple_element_t<7, Tuple>;
+    using D0Layout    = std::tuple_element_t<8, Tuple>;
+    using D1Layout    = std::tuple_element_t<9, Tuple>;
+    using ELayout     = std::tuple_element_t<10, Tuple>;
+
+    constexpr static auto ProfileGemmMultiplyMultiplyImpl =
+        ck::profiler::profile_gemm_multiply_multiply_impl<ADataType,
+                                                          BDataType,
+                                                          AccDataType, // ComputeDataType for
+                                                                       // reference gemm
+                                                          AccDataType,
+                                                          D0DataType,
+                                                          D1DataType,
+                                                          EDataType,
+                                                          ALayout,
+                                                          BLayout,
+                                                          D0Layout,
+                                                          D1Layout,
+                                                          ELayout>;
+
+    public:
+    void Run()
+    {
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {16, 32, 64}, {512, 2048, 4096}, {2048, 1024, 16}};
+
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M        = length[0];
+            int N        = length[1];
+            int K        = length[2];
+            int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB  = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
+            int StrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
+            int StrideE  = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success = all_success & ProfileGemmMultiplyMultiplyImpl(1,
+                                                                        1,
+                                                                        false,
+                                                                        true,
+                                                                        M,
+                                                                        N,
+                                                                        K,
+                                                                        StrideA,
+                                                                        StrideB,
+                                                                        StrideD0,
+                                                                        StrideD1,
+                                                                        StrideE,
+                                                                        1,
+                                                                        1,
+                                                                        1,
+                                                                        0);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+using KernelTypes = ::testing::Types<
+#ifdef CK_USE_WMMA_FP8
+    std::tuple<F8, F8, F32, F32, F32, F16, Row, Col, Row, Col, Row>,
+    std::tuple<F8, F8, F32, F32, F32, BF16, Row, Col, Row, Col, Row>,
+#endif
+    std::tuple<I8, I8, I32, F16, F16, F16, Row, Col, Row, Col, Row>,
+    std::tuple<I8, I8, I32, F32, F32, BF16, Row, Col, Row, Col, Row>>;
+
+TYPED_TEST_SUITE(TestGemmMultiplyMultiply, KernelTypes);
+TYPED_TEST(TestGemmMultiplyMultiply, Test) { this->Run(); }
diff --git a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16_xdl.cpp b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16_xdl.cpp
index 3f05996878..ae872d3133 100644
--- a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16_xdl.cpp
+++ b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16_xdl.cpp
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "ck/host_utility/device_prop.hpp"
 #include "profiler/profile_gemm_add_relu_add_layernorm_impl.hpp"
 
 using Row = ck::tensor_layout::gemm::RowMajor;
@@ -75,3 +77,13 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestGemmAddReluAddLayernorm, KernelTypes);
 TYPED_TEST(TestGemmAddReluAddLayernorm, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        std::cout << "No available instance for gfx11 & gfx12." << std::endl;
+        return 0;
+    }
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_multi_abd/CMakeLists.txt b/test/gemm_multi_abd/CMakeLists.txt
new file mode 100644
index 0000000000..d700414b05
--- /dev/null
+++ b/test/gemm_multi_abd/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_gtest_executable(test_gemm_multi_abd_wmma test_gemm_multi_abd_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_multi_abd_wmma PRIVATE utility device_gemm_multi_abd_instance)
+endif()
+
+add_gtest_executable(test_gemm_multi_abd_xdl test_gemm_multi_abd_xdl.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_multi_abd_xdl PRIVATE utility device_gemm_multi_abd_instance)
+endif()
diff --git a/test/gemm_multi_abd/test_gemm_common.hpp b/test/gemm_multi_abd/test_gemm_common.hpp
new file mode 100644
index 0000000000..030fbcac77
--- /dev/null
+++ b/test/gemm_multi_abd/test_gemm_common.hpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+
+namespace ck {
+namespace test {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using F32 = float;
+
+template <typename Tuple>
+class TestGemmCommon : public ::testing::Test
+{
+    protected:
+    using AsLayout     = std::tuple_element_t<0, Tuple>;
+    using BsLayout     = std::tuple_element_t<1, Tuple>;
+    using DsLayout     = std::tuple_element_t<2, Tuple>;
+    using ELayout      = Row;
+    using AsDataType   = std::tuple_element_t<3, Tuple>;
+    using BsDataType   = std::tuple_element_t<4, Tuple>;
+    using DsDataType   = std::tuple_element_t<5, Tuple>;
+    using EDataType    = std::tuple_element_t<6, Tuple>;
+    using AElementOp   = std::tuple_element_t<7, Tuple>;
+    using BElementOp   = std::tuple_element_t<8, Tuple>;
+    using CDEElementOp = std::tuple_element_t<9, Tuple>;
+
+    void Run()
+    {
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {16, 32, 64}, {512, 1024, 2048}, {1024, 512, 32}};
+
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M = length[0];
+            int N = length[1];
+            int K = length[2];
+            // Assuming same layout for all A matrices (same applies for Bs and Ds)
+            int StrideA = ck::is_same_v<remove_cvref_t<tuple_element_t<0, AsLayout>>, Row> ? K : M;
+            int StrideB = ck::is_same_v<remove_cvref_t<tuple_element_t<0, BsLayout>>, Row> ? N : K;
+            // In case no D matrices are provided, set stride to 0
+            int StrideD = 0;
+            if constexpr(DsDataType::Size() > 0)
+            {
+                StrideD = ck::is_same_v<remove_cvref_t<tuple_element_t<0, DsLayout>>, Row> ? N : M;
+            }
+            int StrideE = ck::is_same_v<ELayout, Row> ? N : M;
+
+            all_success =
+                all_success & ck::profiler::profile_gemm_multi_abd_impl<AsDataType,
+                                                                        BsDataType,
+                                                                        F32,
+                                                                        DsDataType,
+                                                                        EDataType,
+                                                                        AsLayout,
+                                                                        BsLayout,
+                                                                        DsLayout,
+                                                                        ELayout,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CDEElementOp>(
+                                  1, 2, false, false, M, N, K, StrideA, StrideB, StrideD, StrideE);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/gemm_multi_abd/test_gemm_multi_abd_wmma.cpp b/test/gemm_multi_abd/test_gemm_multi_abd_wmma.cpp
new file mode 100644
index 0000000000..42584ecc02
--- /dev/null
+++ b/test/gemm_multi_abd/test_gemm_multi_abd_wmma.cpp
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_multi_abd_impl.hpp"
+#include "test_gemm_common.hpp"
+
+namespace ck {
+namespace test {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using I8   = int8_t;
+using BF16 = ck::bhalf_t;
+
+using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
+using Multiply            = ck::tensor_operation::element_wise::Multiply;
+using Add                 = ck::tensor_operation::element_wise::Add;
+using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
+using FastGelu            = ck::tensor_operation::element_wise::FastGelu;
+using AddFastGelu         = ck::tensor_operation::element_wise::AddFastGelu;
+using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
+using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
+
+using KernelTypesABD = ::testing::Types<std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   Add>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   Add>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   AddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   AddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   FastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   FastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   PassThrough>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   PassThrough>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16, BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyAddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16, BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyAdd>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   Multiply>>;
+
+TYPED_TEST_SUITE(TestGemmCommon, KernelTypesABD);
+TYPED_TEST(TestGemmCommon, Test_BF16I8BF16) { this->Run(); }
+
+} // namespace test
+} // namespace ck
diff --git a/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp b/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp
new file mode 100644
index 0000000000..42584ecc02
--- /dev/null
+++ b/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_multi_abd_impl.hpp"
+#include "test_gemm_common.hpp"
+
+namespace ck {
+namespace test {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using I8   = int8_t;
+using BF16 = ck::bhalf_t;
+
+using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
+using Multiply            = ck::tensor_operation::element_wise::Multiply;
+using Add                 = ck::tensor_operation::element_wise::Add;
+using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
+using FastGelu            = ck::tensor_operation::element_wise::FastGelu;
+using AddFastGelu         = ck::tensor_operation::element_wise::AddFastGelu;
+using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
+using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
+
+using KernelTypesABD = ::testing::Types<std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   Add>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   Add>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   AddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   AddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   FastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   FastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   PassThrough>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Col, Col>,
+                                                   ck::Tuple<>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8, BF16>,
+                                                   ck::Tuple<>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   Multiply,
+                                                   PassThrough>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16, BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyAddFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row, Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16, BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyAdd>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   MultiplyFastGelu>,
+                                        std::tuple<ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<Row>,
+                                                   ck::Tuple<BF16>,
+                                                   ck::Tuple<I8>,
+                                                   ck::Tuple<BF16>,
+                                                   BF16,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   Multiply>>;
+
+TYPED_TEST_SUITE(TestGemmCommon, KernelTypesABD);
+TYPED_TEST(TestGemmCommon, Test_BF16I8BF16) { this->Run(); }
+
+} // namespace test
+} // namespace ck
diff --git a/test/gemm_reduce/gemm_reduce_fp16_xdl.cpp b/test/gemm_reduce/gemm_reduce_fp16_xdl.cpp
index 35a149f52c..b1f2c36c9f 100644
--- a/test/gemm_reduce/gemm_reduce_fp16_xdl.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16_xdl.cpp
@@ -4,9 +4,20 @@
 #include <iostream>
 
 #include "profiler/profile_gemm_reduce_impl.hpp"
-
-int main()
+static ck::index_t instance_index = -1;
+int main(int argc, char** argv)
 {
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
 
@@ -19,22 +30,22 @@ int main()
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
-                   true, 1, false, false, M, N, K, K, N, N);
+                   true, 1, false, false, M, N, K, K, N, N, instance_index);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
-                   true, 1, false, false, M, N, K, K, K, N);
+                   true, 1, false, false, M, N, K, K, K, N, instance_index);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
-                   true, 1, false, false, M, N, K, M, N, N);
+                   true, 1, false, false, M, N, K, M, N, N, instance_index);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
-                   true, 1, false, false, M, N, K, M, K, N);
+                   true, 1, false, false, M, N, K, M, K, N, instance_index);
 
     if(pass)
     {
diff --git a/test/gemm_split_k/test_gemm_splitk_util.hpp b/test/gemm_split_k/test_gemm_splitk_util.hpp
index 99d9d5e832..f994f146c7 100644
--- a/test/gemm_split_k/test_gemm_splitk_util.hpp
+++ b/test/gemm_split_k/test_gemm_splitk_util.hpp
@@ -15,6 +15,8 @@
 #include "include/ck/utility/data_type.hpp"
 #include "profiler/profile_gemm_splitk_impl.hpp"
 
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 namespace ck {
 namespace test {
 
@@ -48,8 +50,13 @@ class TestGemmSplitK : public testing::Test
              const int StrideB,
              const int StrideC)
     {
-        for(auto kb : k_batches_)
+        for(size_t i = 0; i < k_batches_.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto kb = k_batches_[i];
             RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
         }
     }
@@ -82,7 +89,8 @@ class TestGemmSplitK : public testing::Test
                                                                     StrideC,
                                                                     kbatch,
                                                                     n_warmup,
-                                                                    n_iter);
+                                                                    n_iter,
+                                                                    instance_index);
         EXPECT_TRUE(pass);
     }
 };
diff --git a/test/gemm_split_k/test_gemm_splitk_xdl.cpp b/test/gemm_split_k/test_gemm_splitk_xdl.cpp
index 9eba5bba37..3ff32977fa 100644
--- a/test/gemm_split_k/test_gemm_splitk_xdl.cpp
+++ b/test/gemm_split_k/test_gemm_splitk_xdl.cpp
@@ -7,6 +7,9 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_splitk_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 using F16 = ck::half_t;
 using F32 = float;
 
@@ -64,3 +67,20 @@ TYPED_TEST_SUITE(TestGemmSplitK_KM_KN, KernelTypes);
 TYPED_TEST_SUITE(TestGemmSplitK_KM_NK, KernelTypes);
 
 #include "test_gemm_splitk_ut_cases.inc"
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_util.hpp b/test/gemm_universal/test_gemm_universal_util.hpp
index cb9bd4743d..12835805b3 100644
--- a/test/gemm_universal/test_gemm_universal_util.hpp
+++ b/test/gemm_universal/test_gemm_universal_util.hpp
@@ -14,7 +14,8 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "include/ck/utility/data_type.hpp"
 #include "profiler/profile_gemm_universal_impl.hpp"
-
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 namespace ck {
 namespace test {
 
@@ -49,8 +50,13 @@ class TestGemmUniversal : public testing::Test
              const int StrideB,
              const int StrideC)
     {
-        for(auto kb : k_batches_)
+        for(size_t i = 0; i < k_batches_.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto kb = k_batches_[i];
             RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
         }
     }
@@ -84,7 +90,8 @@ class TestGemmUniversal : public testing::Test
                                                                        StrideC,
                                                                        kbatch,
                                                                        n_warmup,
-                                                                       n_iter);
+                                                                       n_iter,
+                                                                       instance_index);
         EXPECT_TRUE(pass);
     }
 };
diff --git a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
index 311c4de32d..5e7aa7ddc7 100644
--- a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
+++ b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
@@ -6,10 +6,11 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
-using I4   = ck::pk_i4_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+using I4                   = ck::pk_i4_t;
+using BF16                 = ck::bhalf_t;
+using F32                  = float;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -85,3 +86,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
 
 #include "test_gemm_universal_ut_cases_bf16.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
index 2f51253766..e530e5bbc2 100644
--- a/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
+++ b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
@@ -6,10 +6,11 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
-using I4  = ck::pk_i4_t;
-using F8  = ck::f8_t;
-using F16 = ck::half_t;
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+using I4                   = ck::pk_i4_t;
+using F8                   = ck::f8_t;
+using F16                  = ck::half_t;
 
 using F32 = float;
 
@@ -99,3 +100,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_NK, KernelTypes_KM_NK);
 TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_KN, KernelTypes_KM_KN);
 
 #include "test_gemm_universal_ut_cases_fp16.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
index 3484d49b93..81695258f6 100644
--- a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
+++ b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
@@ -6,7 +6,8 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 #if defined(CK_USE_WMMA_FP8)
 
 using F8   = ck::f8_t;
@@ -59,3 +60,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
 #include "test_gemm_universal_ut_cases_fp8.inc"
 
 #endif // CK_USE_WMMA_FP8
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
index 8fde65657a..9e643df7b8 100644
--- a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
@@ -6,9 +6,10 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
-using BF16 = ck::bhalf_t;
-using F32  = float;
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+using BF16                 = ck::bhalf_t;
+using F32                  = float;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -80,3 +81,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
 
 #include "test_gemm_universal_ut_cases_bf16.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
index 4eafb8c2e3..cabf6fb38d 100644
--- a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
@@ -6,9 +6,10 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
-using F8  = ck::f8_t;
-using F16 = ck::half_t;
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+using F8                   = ck::f8_t;
+using F16                  = ck::half_t;
 
 using F32 = float;
 
@@ -92,3 +93,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_NK, KernelTypes_KM_NK);
 TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_KN, KernelTypes_KM_KN);
 
 #include "test_gemm_universal_ut_cases_fp16.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
index e833ab7825..d99f25eb12 100644
--- a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
@@ -6,11 +6,12 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
-
-using F8   = ck::f8_t;
-using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+using F8                   = ck::f8_t;
+using F16                  = ck::half_t;
+using BF16                 = ck::bhalf_t;
+using F32                  = float;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -69,3 +70,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
 
 
 #include "test_gemm_universal_ut_cases_fp8.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal_reduce/CMakeLists.txt b/test/gemm_universal_reduce/CMakeLists.txt
new file mode 100644
index 0000000000..dab9de44c0
--- /dev/null
+++ b/test/gemm_universal_reduce/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_gtest_executable(test_gemm_universal_reduce_bf16_wmma test_gemm_universal_reduce_bf16_wmma.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_gemm_universal_reduce_bf16_wmma PRIVATE utility device_gemm_universal_reduce_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_reduce_fp16_wmma test_gemm_universal_reduce_fp16_wmma.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_gemm_universal_reduce_fp16_wmma PRIVATE utility device_gemm_universal_reduce_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_reduce_bf16A_i8_wmma test_gemm_universal_reduce_bf16A_i8_wmma.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_gemm_universal_reduce_bf16A_i8_wmma PRIVATE utility device_gemm_universal_reduce_instance)
+endif()
diff --git a/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16A_i8_wmma.cpp b/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16A_i8_wmma.cpp
new file mode 100644
index 0000000000..ec4c0dc784
--- /dev/null
+++ b/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16A_i8_wmma.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "profiler/profile_gemm_universal_reduce_impl.hpp"
+
+TEST(GemmUniversalReduce, BF16A_I8)
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+
+    int M      = 512;
+    int N      = 256;
+    int K      = 128;
+    int KBatch = 1;
+
+    bool pass = true;
+
+    pass = pass && ck::profiler::profile_gemm_universal_reduce_impl<ck::bhalf_t,
+                                                                    int8_t,
+                                                                    ck::Tuple<>,
+                                                                    float,
+                                                                    ck::bhalf_t,
+                                                                    Row,
+                                                                    Row,
+                                                                    ck::Tuple<>,
+                                                                    Row>(
+                       true, 3, false, true, M, N, K, K, N, N, KBatch, 1, 10);
+    EXPECT_TRUE(pass);
+}
diff --git a/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16_wmma.cpp b/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16_wmma.cpp
new file mode 100644
index 0000000000..cbc7860fd9
--- /dev/null
+++ b/test/gemm_universal_reduce/test_gemm_universal_reduce_bf16_wmma.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "profiler/profile_gemm_universal_reduce_impl.hpp"
+
+TEST(GemmUniversalReduce, BF16)
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+
+    int M      = 512;
+    int N      = 256;
+    int K      = 128;
+    int KBatch = 1;
+
+    bool pass = true;
+
+    pass = pass && ck::profiler::profile_gemm_universal_reduce_impl<ck::bhalf_t,
+                                                                    ck::bhalf_t,
+                                                                    ck::Tuple<>,
+                                                                    float,
+                                                                    ck::bhalf_t,
+                                                                    Row,
+                                                                    Row,
+                                                                    ck::Tuple<>,
+                                                                    Row>(
+                       true, 1, false, true, M, N, K, K, N, N, KBatch, 1, 10);
+    EXPECT_TRUE(pass);
+}
diff --git a/test/gemm_universal_reduce/test_gemm_universal_reduce_fp16_wmma.cpp b/test/gemm_universal_reduce/test_gemm_universal_reduce_fp16_wmma.cpp
new file mode 100644
index 0000000000..731bee89ed
--- /dev/null
+++ b/test/gemm_universal_reduce/test_gemm_universal_reduce_fp16_wmma.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "profiler/profile_gemm_universal_reduce_impl.hpp"
+
+TEST(GemmUniversalReduce, FP16)
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+
+    int M      = 512;
+    int N      = 256;
+    int K      = 128;
+    int KBatch = 1;
+
+    bool pass = true;
+
+    pass = pass && ck::profiler::profile_gemm_universal_reduce_impl<ck::half_t,
+                                                                    ck::half_t,
+                                                                    ck::Tuple<>,
+                                                                    float,
+                                                                    ck::half_t,
+                                                                    Row,
+                                                                    Row,
+                                                                    ck::Tuple<>,
+                                                                    Row>(
+                       true, 1, false, true, M, N, K, K, N, N, KBatch, 1, 10);
+    EXPECT_TRUE(pass);
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp
index 805587a274..6fdf8aa71f 100644
--- a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp
@@ -15,6 +15,9 @@
 #include "include/ck/utility/data_type.hpp"
 #include "profiler/profile_gemm_universal_streamk_impl.hpp"
 
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
+
 namespace ck {
 namespace test {
 
@@ -56,8 +59,13 @@ class TestGemmUniversal_Streamk : public testing::Test
              const int StrideB,
              const int StrideC)
     {
-        for(auto streamk_sel : streamk_sel_list)
+        for(size_t i = 0; i < streamk_sel_list.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto streamk_sel = streamk_sel_list[i];
             RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1);
         }
     }
@@ -93,7 +101,8 @@ class TestGemmUniversal_Streamk : public testing::Test
                                                                                streamk_sel,
                                                                                Grid_size,
                                                                                n_warmup,
-                                                                               n_iter);
+                                                                               n_iter,
+                                                                               instance_index);
         EXPECT_TRUE(pass);
     }
 };
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp
index 1aef74cf18..5675413862 100644
--- a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp
@@ -7,6 +7,9 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_streamk_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 using BF16 = ck::bhalf_t;
 using F32  = float;
 
@@ -83,3 +86,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_KN, KernelTypes_KM_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_NK, KernelTypes_KM_NK);
 
 #include "test_gemm_universal_streamk_ut_cases_bf16.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp
index 43b122ff0d..b6262c95c9 100644
--- a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp
@@ -7,6 +7,9 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_streamk_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 using F8  = ck::f8_t;
 using F16 = ck::half_t;
 
@@ -82,3 +85,20 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_KN, KernelTypes_MK_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_NK, KernelTypes_MK_NK);
 
 #include "test_gemm_universal_streamk_ut_cases_fp16.inc"
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp
index 3836de056c..a9ea93bfa6 100644
--- a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp
@@ -7,6 +7,9 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_streamk_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 using F8   = ck::f8_t;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
@@ -72,3 +75,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_KN, KernelTypes_MK_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_NK, KernelTypes_MK_NK);
 
 #include "test_gemm_universal_streamk_ut_cases_fp8.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
index 5c816da416..dfd08bc42e 100644
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -2,7 +2,7 @@ add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_da
 if(result EQUAL 0)
     target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_executable(test_grouped_convnd_bwd_data_xdl_large_cases test_grouped_convnd_bwd_data_xdl_large_cases.cpp)
     target_compile_options(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
index bc592ba665..01f4260c43 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -47,7 +47,7 @@ class TestGroupedConvndBwdData : public ::testing::Test
     // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
     // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
     // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-    < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType,   ck::Tuple<>, DataType, Pass, Pass,    Pass,  ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+    < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType,   ck::Tuple<>, DataType, Pass, Pass,    Pass,  ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>;
     // clang-format on
 
     ck::utils::conv::ConvParam conv_param;
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
index 7ad7b78d6f..f335183a52 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
@@ -11,6 +11,9 @@
 
 #include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestGroupedConvndBwdDataWmma : public ::testing::Test
 {
@@ -27,20 +30,27 @@ class TestGroupedConvndBwdDataWmma : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
-                                                                            OutLayout,
-                                                                            WeiLayout,
-                                                                            InLayout,
-                                                                            DataType,
-                                                                            DataType,
-                                                                            DataType>(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass        = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                                   OutLayout,
+                                                                                   WeiLayout,
+                                                                                   InLayout,
+                                                                                   DataType,
+                                                                                   DataType,
+                                                                                   DataType>(
                                true,  // do_verification
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               1, // splitK
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -106,3 +116,20 @@ TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D)
         {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
index 209b9b4f55..17839887bb 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
@@ -11,6 +11,9 @@
 
 #include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
 
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestGroupedConvndBwdDataXdl : public ::testing::Test
 {
@@ -30,21 +33,27 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
         bool pass = true;
         for(auto split_k : split_ks)
         {
-            for(auto& param : conv_params)
+            for(size_t i = 0; i < conv_params.size(); i++)
             {
-                pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
-                                                                                OutLayout,
-                                                                                WeiLayout,
-                                                                                InLayout,
-                                                                                DataType,
-                                                                                DataType,
-                                                                                DataType>(
+                if((param_mask & (1 << i)) == 0)
+                {
+                    continue;
+                }
+                auto& param = conv_params[i];
+                pass        = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                                       OutLayout,
+                                                                                       WeiLayout,
+                                                                                       InLayout,
+                                                                                       DataType,
+                                                                                       DataType,
+                                                                                       DataType>(
                                    true,  // do_verification
                                    1,     // init_method: integer value
                                    false, // do_log
                                    false, // time_kernel
                                    param,
-                                   split_k);
+                                   split_k,
+                                   instance_index);
             }
         }
         EXPECT_TRUE(pass);
@@ -149,3 +158,19 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
         {3, 1, 1, 1, 1, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt
index 2db0fb1cf3..7c2b208c6b 100644
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
    add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp)
    target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance device_grouped_convnd_bwd_weight_instance)
 elseif(DL_KERNELS)
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp b/test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
index 11748d4717..b3ed49ed8c 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
@@ -23,6 +23,8 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestGroupedConvndBwdWeight : public ::testing::Test
 {
@@ -83,7 +85,8 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
     }
 
     bool PerformConvWeightBilinear(ck::utils::conv::ConvParam& conv_param,
-                                   const ck::index_t split_k)
+                                   const ck::index_t split_k,
+                                   ck::index_t instance_index_ = -1)
     {
         bool passed = true;
 
@@ -163,6 +166,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
         // get device op instances
         const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
             DeviceOp>::GetInstances();
+        int num_kernel = 0;
 
         for(std::size_t i = 0; i < op_ptrs.size(); ++i)
         {
@@ -197,6 +201,12 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
 
             if(op_ptr->IsSupportedArgument(argument_ptr.get()))
             {
+                ++num_kernel;
+                if((instance_index_ != -1) && (instance_index_ + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
                 float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr});
                 wei_device_buf.FromDevice(wei_device.mData.data());
                 passed &= ck::utils::check_err(wei_device, wei_host, "Error: incorrect results!");
@@ -218,6 +228,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
                 std::cerr << op_name << " does not support this problem" << std::endl;
             }
         }
+        if(instance_index != -1)
+        {
+            std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel
+                      << "): Passed" << std::endl;
+        }
         return passed;
     }
 
@@ -228,9 +243,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
 
         for(auto split_k : split_ks)
         {
-            for(auto& param : conv_params)
+            for(size_t i = 0; i < conv_params.size(); i++)
             {
-                pass = pass && PerformConvWeightBilinear(param, split_k);
+                if((param_mask & (1 << i)) == 0)
+                {
+                    continue;
+                }
+                auto& param = conv_params[i];
+                pass        = pass && PerformConvWeightBilinear(param, split_k, instance_index);
             }
         }
         EXPECT_TRUE(pass);
@@ -268,3 +288,20 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
         {3, 1, 1, 4, 4, {3, 3, 3}, {14, 28, 28}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
index 8343629f3a..f0b3b28020 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -15,6 +15,9 @@
 
 #include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 using namespace ck::tensor_layout::convolution;
 
 template <typename Tuple>
@@ -92,8 +95,13 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
 
         for(auto split_k : split_ks)
         {
-            for(auto& param : conv_params)
+            for(size_t i = 0; i < conv_params.size(); i++)
             {
+                if((param_mask & (1 << i)) == 0)
+                {
+                    continue;
+                }
+                auto& param = conv_params[i];
                 if(!skip_case(split_k))
                 {
                     pass = pass && ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial{},
@@ -108,7 +116,8 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
                                        false, // do_log
                                        false, // time_kernel
                                        param,
-                                       std::to_string(split_k));
+                                       std::to_string(split_k),
+                                       instance_index);
                 }
             }
         }
@@ -224,3 +233,20 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
         {3, 16, 16, 1, 1, {3, 3, 3}, {28, 28, 28}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp
index 2ad1cd11f0..354d1fc23b 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -48,11 +48,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
         //##########|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
         //##########| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
         //##########|        |         |          |          |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        <         NDimSpatial,  InLayout, WeiLayout,OutLayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>;
+        <         NDimSpatial,  InLayout, WeiLayout,OutLayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   16,   16,    2,    4,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 8>,               4>;
     // clang-format on
 
     ck::utils::conv::ConvParam conv_param;
-    std::vector<ck::index_t> split_ks{-1, 2};
+    ck::index_t split_k_ = 2;
 
     template <ck::index_t NDimSpatial>
     bool Run()
@@ -96,30 +96,24 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
 
         auto conv = GroupedConvBwdWeightDeviceInstance{};
 
-        bool is_supported = true;
-
-        for(const auto split_k : split_ks)
-        {
-            auto argument = conv.MakeArgument(nullptr,
-                                              nullptr,
-                                              nullptr,
-                                              input_lengths,
-                                              input_strides,
-                                              filter_lengths,
-                                              weights_strides,
-                                              output_lengths,
-                                              output_strides,
-                                              conv_filter_strides,
-                                              conv_filter_dilations,
-                                              input_left_pads,
-                                              input_right_pads,
-                                              PassThrough{},
-                                              PassThrough{},
-                                              PassThrough{},
-                                              split_k);
-            is_supported &= conv.IsSupportedArgument(argument);
-        }
-        return is_supported;
+        auto argument = conv.MakeArgument(nullptr,
+                                          nullptr,
+                                          nullptr,
+                                          input_lengths,
+                                          input_strides,
+                                          filter_lengths,
+                                          weights_strides,
+                                          output_lengths,
+                                          output_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          PassThrough{},
+                                          PassThrough{},
+                                          PassThrough{},
+                                          split_k_);
+        return conv.IsSupportedArgument(argument);
     }
 };
 
@@ -183,3 +177,12 @@ TYPED_TEST(TestGroupedConvndBwdWeightDefault, VectorLoadCheck)
     is_supported     = this->template Run<2>();
     EXPECT_FALSE(is_supported);
 }
+
+TYPED_TEST(TestGroupedConvndBwdWeightDefault, SingleStageAutoDeduce)
+{
+    // Supported version but with auto deduce and single stage
+    this->conv_param  = {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    this->split_k_    = -1;
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+}
diff --git a/test/grouped_convnd_fwd/CMakeLists.txt b/test/grouped_convnd_fwd/CMakeLists.txt
index 24622fa0b5..28583e82c7 100644
--- a/test/grouped_convnd_fwd/CMakeLists.txt
+++ b/test/grouped_convnd_fwd/CMakeLists.txt
@@ -1,13 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
-    if((GPU_TARGETS MATCHES "gfx11") AND (NOT GPU_TARGETS MATCHES "gfx9"))
-        target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
-    else()
-        target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
-    endif()
-endif()
+    target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
 
-if(GPU_TARGETS MATCHES "gfx9")
     add_executable(test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_fwd_large_cases_xdl.cpp)
     target_compile_options(test_grouped_convnd_fwd_large_cases_xdl PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(test_grouped_convnd_fwd_large_cases_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index 1cf91df52c..ca78cf4af3 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -9,6 +9,9 @@
 
 #include "profiler/profile_grouped_conv_fwd_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
 {
@@ -26,23 +29,30 @@ class TestGroupedConvndFwd : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
-                                                                       InLayout,
-                                                                       WeiLayout,
-                                                                       OutLayout,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       IndexType>(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass        = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                              InLayout,
+                                                                              WeiLayout,
+                                                                              OutLayout,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              IndexType>(
                                true,  // do_verification
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               ck::tensor_operation::element_wise::PassThrough{},
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -148,3 +158,20 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 96, 1, 1, 1, {3, 3, 3}, {4, 30, 160}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
index 346f04f66d..9f4c33b34e 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -70,10 +70,10 @@ class TestGroupedConvndFwdMultiABInterfaceBase : public ::testing::Test
             32,          // KPerBlock
             8,           // AK1
             8,           // BK1
-            32,          // MPerXdl
-            32,          // NPerXdl
-            2,           // MXdlPerWave
-            4,           // NXdlPerWave
+            16,          // MPerXdl
+            16,          // NPerXdl
+            4,           // MXdlPerWave
+            8,           // NXdlPerWave
             S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
             S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
             S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -91,7 +91,7 @@ class TestGroupedConvndFwdMultiABInterfaceBase : public ::testing::Test
             1,
             1,
             S<1, 32, 1, 8>,
-            8>;
+            4>;
 
     const ck::utils::conv::ConvParam conv_param{
         3, 1, 16, 16, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
@@ -172,8 +172,8 @@ class TestGroupedConvndFwdMultiABInterfaceBase : public ::testing::Test
 
 class TestGroupedConvndFwdMultiAInterface
     : public TestGroupedConvndFwdMultiABInterfaceBase<float,
-                                                      ck::Tuple<float, float>,
-                                                      float,
+                                                      ck::Tuple<ck::half_t, ck::half_t>,
+                                                      ck::half_t,
                                                       ScaleAdd,
                                                       PassThrough>
 {
@@ -181,8 +181,8 @@ class TestGroupedConvndFwdMultiAInterface
 
 class TestGroupedConvndFwdMultiBInterface
     : public TestGroupedConvndFwdMultiABInterfaceBase<float,
-                                                      float,
-                                                      ck::Tuple<float, float>,
+                                                      ck::half_t,
+                                                      ck::Tuple<ck::half_t, ck::half_t>,
                                                       PassThrough,
                                                       ScaleAdd>
 {
@@ -190,15 +190,18 @@ class TestGroupedConvndFwdMultiBInterface
 
 class TestGroupedConvndFwdMultiABInterface
     : public TestGroupedConvndFwdMultiABInterfaceBase<float,
-                                                      ck::Tuple<float, float>,
-                                                      ck::Tuple<float, float>,
+                                                      ck::Tuple<ck::half_t, ck::half_t>,
+                                                      ck::Tuple<ck::half_t, ck::half_t>,
                                                       ScaleAdd,
                                                       ScaleAdd>
 {
 };
 
-class TestGroupedConvndFwdInterface
-    : public TestGroupedConvndFwdMultiABInterfaceBase<float, float, float, PassThrough, PassThrough>
+class TestGroupedConvndFwdInterface : public TestGroupedConvndFwdMultiABInterfaceBase<float,
+                                                                                      ck::half_t,
+                                                                                      ck::half_t,
+                                                                                      PassThrough,
+                                                                                      PassThrough>
 {
 };
 
diff --git a/test/grouped_convnd_fwd_activation/CMakeLists.txt b/test/grouped_convnd_fwd_activation/CMakeLists.txt
index 4d5196505c..61e101de72 100644
--- a/test/grouped_convnd_fwd_activation/CMakeLists.txt
+++ b/test/grouped_convnd_fwd_activation/CMakeLists.txt
@@ -1,10 +1,13 @@
-if(GPU_TARGETS MATCHES "gfx9")
+if(GPU_TARGETS MATCHES "gfx9|gfx12")
+    #Fail on gfx11 CI but fail to reproduce it in local, disable it temporary
     add_gtest_executable(test_grouped_convnd_fwd_bias_bnorm_clamp test_grouped_convnd_fwd_bias_bnorm_clamp.cpp)
     target_link_libraries(test_grouped_convnd_fwd_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
 
     add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp)
     target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
+endif()
 
+if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp)
     target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance)
 
@@ -13,7 +16,6 @@ if(GPU_TARGETS MATCHES "gfx9")
 
     add_gtest_executable(test_grouped_convnd_fwd_clamp test_grouped_convnd_fwd_clamp.cpp)
     target_link_libraries(test_grouped_convnd_fwd_clamp PRIVATE utility device_grouped_conv2d_fwd_clamp_instance device_grouped_conv3d_fwd_clamp_instance)
-
     add_executable(test_grouped_convnd_fwd_bias_clamp_large_cases test_grouped_convnd_fwd_bias_clamp_large_cases.cpp)
     target_compile_options(test_grouped_convnd_fwd_bias_clamp_large_cases PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(test_grouped_convnd_fwd_bias_clamp_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance)
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp
index bf96d11d53..614f88d44e 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp
@@ -11,7 +11,9 @@
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using BiasNormalizeInInferClamp   = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +32,13 @@ class TestGroupedConvndFwd : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                                   InLayout,
                                                                                   WeiLayout,
@@ -47,7 +54,8 @@ class TestGroupedConvndFwd : public ::testing::Test
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -95,3 +103,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp.cpp
index e38a6d6f6a..af4f8b67f3 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp.cpp
@@ -10,8 +10,9 @@
 #include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-using AddClamp = ck::tensor_operation::element_wise::AddClamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using AddClamp                    = ck::tensor_operation::element_wise::AddClamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +31,13 @@ class TestGroupedConvndFwd : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                                   InLayout,
                                                                                   WeiLayout,
@@ -47,7 +53,8 @@ class TestGroupedConvndFwd : public ::testing::Test
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -95,3 +102,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp_large_cases.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp_large_cases.cpp
index 7a59a95527..dcc3ec1cae 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp_large_cases.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_clamp_large_cases.cpp
@@ -10,8 +10,9 @@
 #include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-using AddClamp = ck::tensor_operation::element_wise::AddClamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using AddClamp                    = ck::tensor_operation::element_wise::AddClamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +31,13 @@ class TestGroupedConvndFwd : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                                   InLayout,
                                                                                   WeiLayout,
@@ -47,7 +53,8 @@ class TestGroupedConvndFwd : public ::testing::Test
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -133,3 +140,19 @@ TYPED_TEST(TestGroupedConvndFwdBiasClamp3d, Test3D)
                                  {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_clamp.cpp
index 55c2e729cd..71fc017f1e 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_clamp.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_clamp.cpp
@@ -10,8 +10,9 @@
 #include "profiler/profile_grouped_conv_fwd_impl.hpp"
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-using Clamp = ck::tensor_operation::element_wise::Clamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using Clamp                       = ck::tensor_operation::element_wise::Clamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -31,25 +32,31 @@ class TestGroupedConvndFwd : public ::testing::Test
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
         Clamp out_element_op{0.f, 256.f};
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
-                                                                       InLayout,
-                                                                       WeiLayout,
-                                                                       OutLayout,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       DataType,
-                                                                       IndexType,
-                                                                       Clamp>(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass        = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                              InLayout,
+                                                                              WeiLayout,
+                                                                              OutLayout,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              IndexType,
+                                                                              Clamp>(
                                true,  // do_verification
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
                                param,
-                               out_element_op);
+                               out_element_op,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -97,3 +104,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp
index 2400008ffa..23ab359648 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp
@@ -10,8 +10,9 @@
 #include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using BiasNormalizeInInferClamp   = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -30,9 +31,14 @@ class TestGroupedConvndFwd : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass &&
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass        = pass &&
                    ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                           InLayout,
                                                                           WeiLayout,
@@ -48,7 +54,8 @@ class TestGroupedConvndFwd : public ::testing::Test
                        1,     // init_method: integer value
                        false, // do_log
                        false, // time_kernel
-                       param);
+                       param,
+                       instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -96,3 +103,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_clamp.cpp
index cd4d90e243..33273568e1 100644
--- a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_clamp.cpp
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_clamp.cpp
@@ -10,8 +10,9 @@
 #include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
 
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-using AddClamp = ck::tensor_operation::element_wise::AddClamp;
+static ck::index_t param_mask     = 0xffffff;
+static ck::index_t instance_index = -1;
+using AddClamp                    = ck::tensor_operation::element_wise::AddClamp;
 
 template <typename Tuple>
 class TestGroupedConvndFwd : public ::testing::Test
@@ -47,7 +48,8 @@ class TestGroupedConvndFwd : public ::testing::Test
                                1,     // init_method: integer value
                                false, // do_log
                                false, // time_kernel
-                               param);
+                               param,
+                               instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -95,3 +97,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_interface_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_interface_xdl.cpp
index 3922a0b229..ef07e2c348 100644
--- a/test/grouped_gemm/test_grouped_gemm_interface_xdl.cpp
+++ b/test/grouped_gemm/test_grouped_gemm_interface_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <stdexcept>
 #include <vector>
@@ -39,7 +39,7 @@ class TestGGemmSplitKInterface_MKNKMN : public ::testing::Test
                                                          BBlockTransferSrcScalarPerVector,
                                                          CDEBlockTransferScalarPerVector_NPerBlock>;
 
-    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 8>;
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 64, 16, 4, 8, 4>;
 };
 
 TEST_F(TestGGemmSplitKInterface_MKNKMN, TileSize)
@@ -67,7 +67,7 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, VectorLoadWidth)
 {
     static constexpr auto GemmMNKPadding =
         ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 4, 8, 8>;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 64, 16, 4, 8, 4>;
 
     std::vector<int> Ms{128, 256, 256, 512};
     constexpr int N = 256;
@@ -111,14 +111,17 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
     EXPECT_FALSE(
         DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
 
-    Ks = std::vector<int>{256, 512, 384, 768};
-    EXPECT_TRUE(
-        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+    if(!ck::is_gfx11_supported())
+    {
+        Ks = std::vector<int>{256, 512, 768, 1536};
+        EXPECT_TRUE(
+            DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
 
-    // Not all gemms have same value for main_k0_block_loop!
-    Ks = std::vector<int>{256, 512, 512, 512};
-    EXPECT_THROW(DefaultGGemmInstance{}.Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch),
-                 std::runtime_error);
+        // Not all gemms have same value for main_k0_block_loop!
+        Ks = std::vector<int>{256, 512, 512, 512};
+        EXPECT_THROW(DefaultGGemmInstance{}.Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch),
+                     std::runtime_error);
+    }
 }
 
 class TestGGemmSplitKInterface_KMKNNM : public ::testing::Test
@@ -150,7 +153,7 @@ class TestGGemmSplitKInterface_KMKNNM : public ::testing::Test
                                                          BBlockTransferSrcScalarPerVector,
                                                          CDEBlockTransferScalarPerVector_NPerBlock>;
 
-    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 4>;
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 64, 16, 4, 8, 4>;
 };
 
 TEST_F(TestGGemmSplitKInterface_KMKNNM, TileSize)
@@ -178,7 +181,7 @@ TEST_F(TestGGemmSplitKInterface_KMKNNM, VectorLoadWidth)
 {
     static constexpr auto GemmMNKPadding =
         ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 2, 8, 4>;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 64, 16, 2, 8, 4>;
 
     std::vector<int> Ms{128, 256, 256, 512};
     constexpr int N = 256;
diff --git a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
index 74d49eb576..a44214be96 100644
--- a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
+++ b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
@@ -10,6 +10,9 @@
 #include "gtest/gtest.h"
 #include "test_grouped_gemm_util.hpp"
 
+ck::index_t param_mask     = 0xffffff;
+ck::index_t instance_index = -1;
+
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using F8   = ck::f8_t;
@@ -42,3 +45,19 @@ using KernelTypes = ::testing::Types<
 TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes);
 
 #include "test_grouped_gemm_ut_cases.inc"
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
index f4011cf998..3a42638e30 100644
--- a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
+++ b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -2,7 +2,7 @@
 
 TYPED_TEST(TestGroupedGemm, TinyCases)
 {
-    const std::vector<int> Ms{0, 1};
+    const std::vector<int> Ms{2, 1};
     constexpr int N = 768;
     constexpr int K = 544;
 
@@ -14,7 +14,7 @@ TYPED_TEST(TestGroupedGemm, TinyCases)
 
 TYPED_TEST(TestGroupedGemm, SmallCases)
 {
-    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
+    const std::vector<int> Ms{2, 1, 3, 4, 5};
     constexpr int N = 768;
     constexpr int K = 544;
 
diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp
index a3ab0e087c..e6a9981671 100644
--- a/test/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/grouped_gemm/test_grouped_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -23,6 +23,9 @@
 #include "ck/utility/number.hpp"
 #include "profiler/profile_grouped_gemm_impl.hpp"
 
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
+
 namespace ck {
 namespace test {
 
@@ -109,8 +112,16 @@ class TestGroupedGemm : public testing::Test
         {
             SetStrides<ELayout>(stride_cs, Ms, Ns);
         }
+        std::vector<int> k_batches;
+        for(size_t i = 0; i < k_batches_.size(); i++)
+        {
+            if(param_mask & (1 << i))
+            {
+                k_batches.push_back(k_batches_[i]);
+            }
+        }
 
-        RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches_);
+        RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches);
     }
 
     void RunSingle(const std::vector<int>& Ms,
@@ -139,7 +150,8 @@ class TestGroupedGemm : public testing::Test
                                                                      StrideCs,
                                                                      kbatches,
                                                                      n_warmup_,
-                                                                     n_iter_);
+                                                                     n_iter_,
+                                                                     instance_index);
         EXPECT_TRUE(pass);
     }
 };
@@ -210,10 +222,10 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
             KPerBlock,
             K1,
             K1,
-            32,
-            32,
+            16,
+            16,
+            8,
             4,
-            2,
             S<1, 4, 16, 1>,
             ABlockTransferThreadClusterArrageOrder,
             ABlockTransferSrcAccessOrder,
@@ -303,12 +315,19 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
         {
             ggemm_instance.SetKBatchSize(&argument, kbatch);
         }
-
-        EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument));
-        auto invoker = ggemm_instance.MakeInvoker();
-        DeviceMem dev_gemm_kargs(ggemm_instance.GetDeviceKernelArgSize(&argument));
-        ggemm_instance.SetDeviceKernelArgs(&argument, dev_gemm_kargs.GetDeviceBuffer());
-        return invoker.Run(argument, StreamConfig{nullptr, false});
+        if(kbatch > 1 && ck::is_gfx11_supported())
+        {
+            EXPECT_FALSE(ggemm_instance.IsSupportedArgument(argument));
+            return 0;
+        }
+        else
+        {
+            EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument));
+            auto invoker = ggemm_instance.MakeInvoker();
+            DeviceMem dev_gemm_kargs(ggemm_instance.GetDeviceKernelArgSize(&argument));
+            ggemm_instance.SetDeviceKernelArgs(&argument, dev_gemm_kargs.GetDeviceBuffer());
+            return invoker.Run(argument, StreamConfig{nullptr, false});
+        }
     }
 };
 
diff --git a/test/magic_number_division/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp
index 253f21e91f..d21d57a157 100644
--- a/test/magic_number_division/magic_number_division.cpp
+++ b/test/magic_number_division/magic_number_division.cpp
@@ -56,10 +56,27 @@ __host__ void cpu_magic_number_division(uint32_t magic_multiplier,
     }
 }
 
-int main(int, char*[])
+int main(int argc, char* argv[])
 {
-    uint64_t num_divisor  = 4096;
-    uint64_t num_dividend = 1L << 16;
+    uint64_t num_divisor   = 4096;
+    uint64_t num_dividend  = 1L << 16;
+    uint32_t divisor_start = 0;
+    uint32_t divisor_end   = num_divisor;
+
+    if(argc == 1)
+    {
+        // use default range
+    }
+    else if(argc == 3)
+    {
+        divisor_start = std::stoi(argv[1]);
+        divisor_end   = std::stoi(argv[2]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 2: divisor_start divisor_end" << std::endl;
+        return 1;
+    }
 
     std::vector<int32_t> divisors_host(num_divisor);
     std::vector<int32_t> dividends_host(num_dividend);
@@ -90,6 +107,10 @@ int main(int, char*[])
 
     for(std::size_t i = 0; i < num_divisor; ++i)
     {
+        if(i < divisor_start || i > divisor_end)
+        {
+            continue;
+        }
         // run naive division on GPU
         gpu_naive_division<<<1024, 256>>>(
             divisors_host[i],
diff --git a/test/normalization_bwd_data/test_groupnorm_bwd_data_fp32.cpp b/test/normalization_bwd_data/test_groupnorm_bwd_data_fp32.cpp
index a7860955cd..e6b5c918ba 100644
--- a/test/normalization_bwd_data/test_groupnorm_bwd_data_fp32.cpp
+++ b/test/normalization_bwd_data/test_groupnorm_bwd_data_fp32.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestgroupnormBwdData : public ::testing::Test
 {
@@ -29,15 +32,20 @@ class TestgroupnormBwdData : public ::testing::Test
                                                          {1, 32, 32, 32, 20},
                                                          {1, 16, 16, 32, 40}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
             bool success = ck::profiler::profile_groupnorm_bwd_data_impl<DYDataType,
                                                                          XDataType,
                                                                          GammaDataType,
                                                                          MeanInvStdDataType,
                                                                          ComputeDataType,
                                                                          DXDataType>(
-                true, 2, false, false, length);
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -49,3 +57,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestgroupnormBwdData, KernelTypes);
 TYPED_TEST(TestgroupnormBwdData, Test_FP32) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_bwd_data/test_layernorm2d_bwd_data_fp32.cpp b/test/normalization_bwd_data/test_layernorm2d_bwd_data_fp32.cpp
index 870f24d064..6786c83938 100644
--- a/test/normalization_bwd_data/test_layernorm2d_bwd_data_fp32.cpp
+++ b/test/normalization_bwd_data/test_layernorm2d_bwd_data_fp32.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestLayernorm2dBwdData : public ::testing::Test
 {
@@ -25,16 +28,21 @@ class TestLayernorm2dBwdData : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
-            bool success =
-                ck::profiler::profile_layernorm_bwd_data_impl<DYDataType,
-                                                              XDataType,
-                                                              GammaDataType,
-                                                              MeanInvStdDataType,
-                                                              ComputeDataType,
-                                                              DXDataType,
-                                                              2>(true, 2, false, false, length);
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
+            bool success = ck::profiler::profile_layernorm_bwd_data_impl<DYDataType,
+                                                                         XDataType,
+                                                                         GammaDataType,
+                                                                         MeanInvStdDataType,
+                                                                         ComputeDataType,
+                                                                         DXDataType,
+                                                                         2>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -46,3 +54,20 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestLayernorm2dBwdData, KernelTypes);
 TYPED_TEST(TestLayernorm2dBwdData, Test_FP32) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp b/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
index 53c92413b1..6123efbe8d 100644
--- a/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
+++ b/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestLayernorm2dBwdGammaBeta : public ::testing::Test
 {
@@ -25,8 +28,13 @@ class TestLayernorm2dBwdGammaBeta : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
             bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
                                                                                XDataType,
                                                                                MeanInvStdDataType,
@@ -34,7 +42,7 @@ class TestLayernorm2dBwdGammaBeta : public ::testing::Test
                                                                                DGammaDataType,
                                                                                DBetaDataType,
                                                                                2>(
-                true, 2, false, false, length);
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -46,3 +54,20 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes);
 TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp b/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp
index c31161fb33..e835668bd6 100644
--- a/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp
+++ b/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
@@ -31,16 +34,21 @@ class TestGroupnorm : public ::testing::Test
                                                          {2, 32, 32, 32, 40},
                                                          {1, 16, 16, 32, 40}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
-            bool success =
-                ck::profiler::profile_groupnorm_impl<XDataType,
-                                                     GammaDataType,
-                                                     BetaDataType,
-                                                     ComputeDataType,
-                                                     YDataType,
-                                                     SaveMeanInvStdDataType,
-                                                     true>(true, 2, false, false, length);
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
+            bool success = ck::profiler::profile_groupnorm_impl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                SaveMeanInvStdDataType,
+                                                                true>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -52,3 +60,20 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
 TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp b/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp
index 08d835ed37..fcb9102fac 100644
--- a/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp
+++ b/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
@@ -29,16 +32,21 @@ class TestGroupnorm : public ::testing::Test
                                                          {1, 32, 32, 32, 20},
                                                          {1, 16, 16, 32, 40}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
-            bool success =
-                ck::profiler::profile_groupnorm_impl<XDataType,
-                                                     GammaDataType,
-                                                     BetaDataType,
-                                                     ComputeDataType,
-                                                     YDataType,
-                                                     SaveMeanInvStdDataType,
-                                                     true>(true, 2, false, false, length);
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
+            bool success = ck::profiler::profile_groupnorm_impl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                SaveMeanInvStdDataType,
+                                                                true>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -50,3 +58,20 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
 TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp b/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp
index 3234b2e159..1d8bd560b7 100644
--- a/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp
+++ b/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
@@ -25,8 +28,13 @@ class TestLayernorm2d : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
             bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                 GammaDataType,
                                                                 BetaDataType,
@@ -34,7 +42,8 @@ class TestLayernorm2d : public ::testing::Test
                                                                 YDataType,
                                                                 SaveMeanInvStdDataType,
                                                                 true,
-                                                                2>(true, 2, false, false, length);
+                                                                2>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -46,3 +55,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
 TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp b/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp
index b46715d96a..10ffeb762e 100644
--- a/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp
+++ b/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
@@ -25,8 +28,13 @@ class TestLayernorm2d : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
             bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                 GammaDataType,
                                                                 BetaDataType,
@@ -34,7 +42,8 @@ class TestLayernorm2d : public ::testing::Test
                                                                 YDataType,
                                                                 SaveMeanInvStdDataType,
                                                                 true,
-                                                                2>(true, 2, false, false, length);
+                                                                2>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -46,3 +55,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
 TYPED_TEST(TestLayernorm2d, Test_FP32) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp b/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
index d1a7b9e3df..b7355de96b 100644
--- a/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
+++ b/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t length_mask    = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestLayernorm4d : public ::testing::Test
 {
@@ -25,8 +28,13 @@ class TestLayernorm4d : public ::testing::Test
         std::vector<std::vector<ck::index_t>> lengths = {
             {1, 1, 1, 1}, {7, 7, 7, 7}, {256, 16, 16, 8}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((length_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto length  = lengths[i];
             bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                 GammaDataType,
                                                                 BetaDataType,
@@ -34,7 +42,8 @@ class TestLayernorm4d : public ::testing::Test
                                                                 YDataType,
                                                                 SaveMeanInvStdDataType,
                                                                 true,
-                                                                4>(true, 2, false, false, length);
+                                                                4>(
+                true, 2, false, false, length, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -46,3 +55,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
 TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        length_mask    = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_avg_pool2d_bwd.cpp b/test/pool/test_avg_pool2d_bwd.cpp
index 0866325fc6..54c75a5553 100644
--- a/test/pool/test_avg_pool2d_bwd.cpp
+++ b/test/pool/test_avg_pool2d_bwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_avg_pool2d_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename T>
 class AvgPool2dBWDTest : public ::testing::Test
 {
@@ -16,8 +19,13 @@ class AvgPool2dBWDTest : public ::testing::Test
 
     void Run()
     {
-        for(auto param : this->params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             bool success =
                 ck::profiler::profile_avg_pool2d_bwd_impl<InDataType, OutDataType, NHWC, NHWC>(
                     true,
@@ -29,7 +37,8 @@ class AvgPool2dBWDTest : public ::testing::Test
                     param.window_strides_,
                     param.window_dilations_,
                     param.input_left_pads_,
-                    param.input_right_pads_);
+                    param.input_right_pads_,
+                    instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -131,3 +140,20 @@ TYPED_TEST(AvgPool2D_f16, AvgPool2DTest_f16) { this->Run(); }
 TYPED_TEST(AvgPool2D_bf16, AvgPool2DTest_bf16) { this->Run(); }
 
 TYPED_TEST(AvgPool2D_f8, AvgPool2DTest_f8) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_avg_pool2d_fwd.cpp b/test/pool/test_avg_pool2d_fwd.cpp
index b5e733419a..ba78973042 100644
--- a/test/pool/test_avg_pool2d_fwd.cpp
+++ b/test/pool/test_avg_pool2d_fwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_pool2d_fwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestAvgPool2dFwd : public ::testing::Test
 {
@@ -18,8 +21,13 @@ class TestAvgPool2dFwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             bool success =
                 ck::profiler::profile_pool2d_fwd_impl<InDataType,
                                                       OutDataType,
@@ -38,7 +46,8 @@ class TestAvgPool2dFwd : public ::testing::Test
                                                              param.window_strides_,
                                                              param.window_dilations_,
                                                              param.input_left_pads_,
-                                                             param.input_right_pads_);
+                                                             param.input_right_pads_,
+                                                             instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -143,3 +152,19 @@ TYPED_TEST(AvgPool2D_F16, AvgPool2D_F16_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_BF16, AvgPool2D_BF16_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_I8, AvgPool2D_I8_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_F8, AvgPool2D_F8_Test) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_avg_pool3d_bwd.cpp b/test/pool/test_avg_pool3d_bwd.cpp
index fbd03fdf45..7fa1c4907a 100644
--- a/test/pool/test_avg_pool3d_bwd.cpp
+++ b/test/pool/test_avg_pool3d_bwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_avg_pool3d_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestAvgPool3dBwd : public ::testing::Test
 {
@@ -19,8 +22,13 @@ class TestAvgPool3dBwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             bool success =
                 ck::profiler::profile_avg_pool3d_bwd_impl<DOutDataType,
                                                           DInDataType,
@@ -35,7 +43,8 @@ class TestAvgPool3dBwd : public ::testing::Test
                                                                      param.window_strides_,
                                                                      param.window_dilations_,
                                                                      param.input_left_pads_,
-                                                                     param.input_right_pads_);
+                                                                     param.input_right_pads_,
+                                                                     instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -72,3 +81,19 @@ TYPED_TEST(TestAvgPool3dBwd, Test_Pool)
 
     this->Run();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_avg_pool3d_fwd.cpp b/test/pool/test_avg_pool3d_fwd.cpp
index 378b05399e..12e83f8e5f 100644
--- a/test/pool/test_avg_pool3d_fwd.cpp
+++ b/test/pool/test_avg_pool3d_fwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_pool3d_fwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestAvgPool3dFwd : public ::testing::Test
 {
@@ -20,8 +23,13 @@ class TestAvgPool3dFwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
                                                             param.window_spatial_lengths_,
                                                             param.window_strides_,
@@ -38,7 +46,8 @@ class TestAvgPool3dFwd : public ::testing::Test
                                                       ck::tensor_layout::convolution::NDHWC,
                                                       ck::ReduceTensorOp::AVG,
                                                       false,
-                                                      false>(in_params_avg_pool, kernel_params);
+                                                      false>(
+                    in_params_avg_pool, kernel_params, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -61,3 +70,19 @@ TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 
     this->Run();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_max_pool2d_bwd.cpp b/test/pool/test_max_pool2d_bwd.cpp
index 65a897dd5b..e6a53d0d64 100644
--- a/test/pool/test_max_pool2d_bwd.cpp
+++ b/test/pool/test_max_pool2d_bwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_max_pool2d_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename T>
 class MaxPool2dBWDTest : public ::testing::Test
 {
@@ -20,8 +23,13 @@ class MaxPool2dBWDTest : public ::testing::Test
 
     void Run()
     {
-        for(auto param : this->params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             bool success =
                 ck::profiler::profile_max_pool2d_bwd_impl<InDataType,
                                                           OutDataType,
@@ -37,7 +45,8 @@ class MaxPool2dBWDTest : public ::testing::Test
                                                                  param.window_strides_,
                                                                  param.window_dilations_,
                                                                  param.input_left_pads_,
-                                                                 param.input_right_pads_);
+                                                                 param.input_right_pads_,
+                                                                 instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -137,3 +146,20 @@ TYPED_TEST(MaxPool2D_f16, MaxPool2DTest_f16) { this->Run(); }
 TYPED_TEST(MaxPool2D_bf16, MaxPool2DTest_bf16) { this->Run(); }
 
 TYPED_TEST(MaxPool2D_f8, MaxPool2DTest_f8) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_max_pool2d_fwd.cpp b/test/pool/test_max_pool2d_fwd.cpp
index bb6fc96cb1..4bf2a1cf8d 100644
--- a/test/pool/test_max_pool2d_fwd.cpp
+++ b/test/pool/test_max_pool2d_fwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_pool2d_fwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestMaxPool2dFwd : public ::testing::Test
 {
@@ -19,8 +22,13 @@ class TestMaxPool2dFwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             // max pool
             bool success =
                 ck::profiler::profile_pool2d_fwd_impl<InDataType,
@@ -40,7 +48,8 @@ class TestMaxPool2dFwd : public ::testing::Test
                                                                    param.window_strides_,
                                                                    param.window_dilations_,
                                                                    param.input_left_pads_,
-                                                                   param.input_right_pads_);
+                                                                   param.input_right_pads_,
+                                                                   instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -148,3 +157,20 @@ TYPED_TEST(MaxPool2D_F16, MaxPool2D_F16_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_BF16, MaxPool2D_BF16_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_I8, MaxPool2D_I8_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_F8, MaxPool2D_F8_Test) { this->Run(); }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_max_pool3d_bwd.cpp b/test/pool/test_max_pool3d_bwd.cpp
index 8d52bde4da..1ae2270272 100644
--- a/test/pool/test_max_pool3d_bwd.cpp
+++ b/test/pool/test_max_pool3d_bwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_max_pool3d_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestMaxPool3dBwd : public ::testing::Test
 {
@@ -20,8 +23,13 @@ class TestMaxPool3dBwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
             bool success =
                 ck::profiler::profile_max_pool3d_bwd_impl<InDataType,
                                                           OutDataType,
@@ -37,7 +45,8 @@ class TestMaxPool3dBwd : public ::testing::Test
                                                                  param.window_strides_,
                                                                  param.window_dilations_,
                                                                  param.input_left_pads_,
-                                                                 param.input_right_pads_);
+                                                                 param.input_right_pads_,
+                                                                 instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -77,3 +86,20 @@ TYPED_TEST(TestMaxPool3dBwd, Test_Pool)
 
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/pool/test_max_pool3d_fwd.cpp b/test/pool/test_max_pool3d_fwd.cpp
index d7602f9acd..e7f5614d12 100644
--- a/test/pool/test_max_pool3d_fwd.cpp
+++ b/test/pool/test_max_pool3d_fwd.cpp
@@ -5,6 +5,9 @@
 #include "profiler/profile_pool3d_fwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestMaxPool3dFwd : public ::testing::Test
 {
@@ -21,8 +24,14 @@ class TestMaxPool3dFwd : public ::testing::Test
 
     void Run()
     {
-        for(auto param : params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = this->params[i];
+
             ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
                                                             param.window_spatial_lengths_,
                                                             param.window_strides_,
@@ -40,7 +49,8 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                       ck::tensor_layout::convolution::NDHWC,
                                                       ck::ReduceTensorOp::MAX,
                                                       false,
-                                                      false>(in_params_max_pool, kernel_params);
+                                                      false>(
+                    in_params_max_pool, kernel_params, instance_index);
             EXPECT_TRUE(success);
 
             // max pool + index
@@ -52,8 +62,8 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                             ck::tensor_layout::convolution::NDHWC,
                                                             ck::ReduceTensorOp::MAX,
                                                             false,
-                                                            true>(in_params_max_pool_indexed,
-                                                                  kernel_params);
+                                                            true>(
+                in_params_max_pool_indexed, kernel_params, instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -76,3 +86,20 @@ TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
 
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/quantization/CMakeLists.txt b/test/quantization/CMakeLists.txt
new file mode 100644
index 0000000000..89a99f5e5d
--- /dev/null
+++ b/test/quantization/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_custom_target(test_quantization)
+add_subdirectory(gemm)
diff --git a/test/quantization/gemm/CMakeLists.txt b/test/quantization/gemm/CMakeLists.txt
new file mode 100644
index 0000000000..630e6e09c9
--- /dev/null
+++ b/test/quantization/gemm/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_custom_target(test_gemm_quantization_targets)
+
+add_gtest_executable(test_gemm_quantization test_gemm_quantization.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_quantization PRIVATE utility device_quantization_instance)
+    add_dependencies(test_gemm_quantization_targets test_gemm_quantization)
+endif()
+
+add_dependencies(test_quantization test_gemm_quantization_targets)
diff --git a/test/quantization/gemm/test_gemm_quantization.cpp b/test/quantization/gemm/test_gemm_quantization.cpp
new file mode 100644
index 0000000000..9981ae8a41
--- /dev/null
+++ b/test/quantization/gemm/test_gemm_quantization.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "profiler/profile_gemm_quantization_impl.hpp"
+#include "test_gemm_quantization_util.hpp"
+
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <typename Tuple>
+class TestGemmQuantization : public ck::test::TestGemmQuantizationCommon<Tuple>
+{
+    protected:
+    using ProfileCall = bool (*const)(int, int, bool, bool, int, int, int, int, int, int, float);
+
+    ProfileCall GetImpl() override
+    {
+        return &ck::profiler::profile_gemm_quantization_impl<
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::ADataType,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::BDataType,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::AccDataType,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::EDataType,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::ALayout,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::BLayout,
+            typename ck::test::TestGemmQuantizationCommon<Tuple>::ELayout>;
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<I8, I8, I32, I8, Row, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, Row, Col, Row>,
+                                     std::tuple<I8, I8, I32, I8, Col, Row, Row>,
+                                     std::tuple<I8, I8, I32, I8, Col, Col, Row>>;
+
+TYPED_TEST_SUITE(TestGemmQuantization, KernelTypes);
+
+#include "test_gemm_quantization_ut_cases.inc"
diff --git a/test/quantization/gemm/test_gemm_quantization_ut_cases.inc b/test/quantization/gemm/test_gemm_quantization_ut_cases.inc
new file mode 100644
index 0000000000..83a13e4a85
--- /dev/null
+++ b/test/quantization/gemm/test_gemm_quantization_ut_cases.inc
@@ -0,0 +1,41 @@
+#pragma once
+
+TYPED_TEST(TestGemmQuantization, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run({{M, N, K}});
+}
+
+TYPED_TEST(TestGemmQuantization, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 1024;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run({{M, N, K}});
+}
+
+TYPED_TEST(TestGemmQuantization, MNKPadded)
+{
+    const std::vector<int> Ms{127, 150, 188, 210};
+    constexpr int N = 136;
+    constexpr int K = 280;
+
+    for(int M : Ms)
+        this->Run({{M, N, K}});
+}
+
+TYPED_TEST(TestGemmQuantization, Regular)
+{
+    constexpr int M = 512;
+    constexpr int N = 512;
+    std::vector<int> Ks{512};
+
+    for(int K : Ks)
+        this->Run({{M, N, K}});
+}
diff --git a/test/quantization/gemm/test_gemm_quantization_util.hpp b/test/quantization/gemm/test_gemm_quantization_util.hpp
new file mode 100644
index 0000000000..e1ca0de2db
--- /dev/null
+++ b/test/quantization/gemm/test_gemm_quantization_util.hpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/data_type.hpp"
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using I8  = int8_t;
+using I32 = int32_t;
+
+namespace ck {
+namespace test {
+
+using TestMatrixSizes = std::vector<std::vector<ck::index_t>>;
+
+static const TestMatrixSizes DefaultTestMatrixSizes = {
+    {16, 32, 64}, {512, 2048, 4096}, {2048, 1024, 16}};
+
+template <typename Tuple>
+class TestGemmQuantizationCommon : public ::testing::Test
+{
+    protected:
+    using ADataType   = std::tuple_element_t<0, Tuple>;
+    using BDataType   = std::tuple_element_t<1, Tuple>;
+    using AccDataType = std::tuple_element_t<2, Tuple>;
+    using EDataType   = std::tuple_element_t<3, Tuple>;
+    using ALayout     = std::tuple_element_t<4, Tuple>;
+    using BLayout     = std::tuple_element_t<5, Tuple>;
+    using ELayout     = std::tuple_element_t<6, Tuple>;
+
+    using ProfileCall = bool (*const)(int, int, bool, bool, int, int, int, int, int, int, float);
+
+    virtual ProfileCall GetImpl() = 0;
+
+    void Run(const TestMatrixSizes& lengths = DefaultTestMatrixSizes)
+    {
+        bool all_success = true;
+
+        for(auto length : lengths)
+        {
+            int M               = length[0];
+            int N               = length[1];
+            int K               = length[2];
+            int StrideA         = ck::is_same_v<ALayout, Row> ? K : M;
+            int StrideB         = ck::is_same_v<BLayout, Row> ? N : K;
+            int StrideE         = ck::is_same_v<ELayout, Row> ? N : M;
+            float requant_scale = 0.03f;
+
+            all_success =
+                all_success &
+                GetImpl()(1, 1, false, true, M, N, K, StrideA, StrideB, StrideE, requant_scale);
+        }
+
+        EXPECT_TRUE(all_success);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 9153805889..20f7ee3d57 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -8,6 +8,9 @@
 #include <gtest/gtest.h>
 using namespace ck;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 struct ReduceParam
 {
     bool do_verification{true};
@@ -53,8 +56,13 @@ class ReduceWithIndexTest : public ::testing::Test
     template <ReduceTensorOp ReduceOpIdType>
     void Run()
     {
-        for(auto param : this->params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param  = this->params[i];
             bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                 param.do_verification,
                 param.init_method,
@@ -66,7 +74,8 @@ class ReduceWithIndexTest : public ::testing::Test
                 param.propagateNan,
                 param.useIndex,
                 param.alpha,
-                param.beta);
+                param.beta,
+                instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -201,3 +210,20 @@ TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MAX)
     // trigger Run() -> Generic
     this->template Run<ReduceTensorOp::MAX>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index 796211a69a..fa539b4026 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -8,6 +8,9 @@
 #include <gtest/gtest.h>
 using namespace ck;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 struct ReduceParam
 {
     bool do_verification{true};
@@ -53,8 +56,13 @@ class ReduceWithIndexTest : public ::testing::Test
     template <ReduceTensorOp ReduceOpIdType>
     void Run()
     {
-        for(auto param : this->params)
+        for(size_t i = 0; i < this->params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param  = this->params[i];
             bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                 param.do_verification,
                 param.init_method,
@@ -66,7 +74,8 @@ class ReduceWithIndexTest : public ::testing::Test
                 param.propagateNan,
                 param.useIndex,
                 param.alpha,
-                param.beta);
+                param.beta,
+                instance_index);
             EXPECT_TRUE(success);
         }
     }
@@ -201,3 +210,20 @@ TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MAX)
     // trigger Run() -> Generic
     this->template Run<ReduceTensorOp::MAX>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/softmax/test_softmax_ut_cases.inc b/test/softmax/test_softmax_ut_cases.inc
index cf5e4d2d2d..46154eb445 100644
--- a/test/softmax/test_softmax_ut_cases.inc
+++ b/test/softmax/test_softmax_ut_cases.inc
@@ -58,3 +58,20 @@ TYPED_TEST(TestSoftmax, ReduceOddLengths)
     this->Run({this->Rank - 1});
     this->Run({this->Rank - 2});
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index 1409af8453..96c8fe588d 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -15,6 +15,9 @@
 #include "include/ck/utility/data_type.hpp"
 #include "profiler/profile_softmax_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 namespace ck {
 
 template <typename Range>
@@ -56,7 +59,8 @@ class TestSoftmax : public ::testing::Test
     void RunSingle(std::vector<index_t> in_length,
                    std::vector<index_t> reduce_dims,
                    AccDataType alpha,
-                   AccDataType beta)
+                   AccDataType beta,
+                   index_t instance_index)
     {
         int init_method = 1; // integer value initialization
         bool log        = false;
@@ -67,84 +71,98 @@ class TestSoftmax : public ::testing::Test
         {
             if(reduce_dims.size() == 1)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
             else if(reduce_dims.size() == 2)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
             else if(reduce_dims.size() == 3)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
         }
         else if constexpr(Rank == 4)
         {
             if(reduce_dims.size() == 1)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
             else if(reduce_dims.size() == 2)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
             else if(reduce_dims.size() == 3)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
             else if(reduce_dims.size() == 4)
                 pass = ck::profiler::
-                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
-                                                                                        init_method,
-                                                                                        log,
-                                                                                        bench_,
-                                                                                        in_length,
-                                                                                        strides,
-                                                                                        reduce_dims,
-                                                                                        alpha,
-                                                                                        beta);
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(
+                        verify_,
+                        init_method,
+                        log,
+                        bench_,
+                        in_length,
+                        strides,
+                        reduce_dims,
+                        alpha,
+                        beta,
+                        instance_index);
         };
 
         EXPECT_TRUE(pass);
@@ -161,7 +179,7 @@ class TestSoftmax : public ::testing::Test
         {
             for(auto scale : this->scales_)
             {
-                this->RunSingle(in_length, reduce_dims, scale[0], scale[1]);
+                this->RunSingle(in_length, reduce_dims, scale[0], scale[1], instance_index);
             }
         }
     }
diff --git a/test/wrapper/test_wrapper_gemm_xdl.cpp b/test/wrapper/test_wrapper_gemm_xdl.cpp
index fd2cb7d4f3..11430d49af 100644
--- a/test/wrapper/test_wrapper_gemm_xdl.cpp
+++ b/test/wrapper/test_wrapper_gemm_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <cstdlib>
@@ -81,6 +81,7 @@ __global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                         const BlockShape tile_shape,
                                                         const ThreadLayout thread_layout)
 {
+#if defined(__gfx9__)
     constexpr auto MPerBlock  = ck::wrapper::size<0>(tile_shape);
     constexpr auto NPerBlock  = ck::wrapper::size<1>(tile_shape);
     constexpr auto KPerBlock  = ck::wrapper::size<2>(tile_shape);
@@ -256,6 +257,16 @@ __global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
         a_lds_tensor, b_lds_tensor, c_vgpr_reg);
 
     ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
+#else
+    ck::ignore = p_a;
+    ck::ignore = p_b;
+    ck::ignore = p_c;
+    ck::ignore = M;
+    ck::ignore = N;
+    ck::ignore = K;
+    ck::ignore = tile_shape;
+    ck::ignore = thread_layout;
+#endif
 }
 
 template <typename DataType,
@@ -374,3 +385,14 @@ TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
     PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1, 4, false>(
         512, 512, 128, tile_shape, thread_layout);
 }
+
+int main(int argc, char** argv)
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        std::cout << "This test support gfx9 only" << std::endl;
+        return 0;
+    }
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test_data/generate_model_configs.py b/test_data/generate_model_configs.py
index 125655cef4..f852d781d6 100644
--- a/test_data/generate_model_configs.py
+++ b/test_data/generate_model_configs.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Generate Model Configuration Combinations for MIOpen Testing
 
diff --git a/test_data/generate_test_dataset.sh b/test_data/generate_test_dataset.sh
index 3fb8fa027b..1124311feb 100755
--- a/test_data/generate_test_dataset.sh
+++ b/test_data/generate_test_dataset.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 # Generate Comprehensive Convolution Test Dataset for CK
 # This script captures MIOpen commands from PyTorch models and generates test cases
 
diff --git a/test_data/miopen_to_csv.py b/test_data/miopen_to_csv.py
index ae8c187b43..3292584548 100644
--- a/test_data/miopen_to_csv.py
+++ b/test_data/miopen_to_csv.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Convert MIOpen Driver Commands to CSV Test Cases
 
diff --git a/test_data/run_model_with_miopen.py b/test_data/run_model_with_miopen.py
index 3d96e19f2f..596f6a4a37 100644
--- a/test_data/run_model_with_miopen.py
+++ b/test_data/run_model_with_miopen.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 PyTorch Model Runner with MIOpen Command Logging using torchvision models
 
diff --git a/tile_engine/ops/CMakeLists.txt b/tile_engine/ops/CMakeLists.txt
index 7d7002af1b..db100553f3 100644
--- a/tile_engine/ops/CMakeLists.txt
+++ b/tile_engine/ops/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(gemm)
-add_subdirectory(gemm_multi_d)
\ No newline at end of file
+add_subdirectory(gemm_multi_d)
+add_subdirectory(gemm_preshuffle)
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt
index d52351af2d..77165ae0fa 100644
--- a/tile_engine/ops/gemm/CMakeLists.txt
+++ b/tile_engine/ops/gemm/CMakeLists.txt
@@ -13,38 +13,38 @@ function(create_individual_gemm_target datatype layout trait tile_config config_
         message(WARNING "Skipping individual GEMM target ${datatype}_${layout}_${trait}_${tile_config}: No supported GPU targets")
         return()
     endif()
-    
+
     # Parse tile configuration: format is tile_mxtile_nxtile_k_warp_mxwarp_nxwarp_k_warp_tile_mxwarp_tile_nxwarp_tile_k
     # First split by underscore to get three groups
     string(REPLACE "_" ";" config_groups ${tile_config})
     list(GET config_groups 0 tile_dims)      # e.g., 256x256x32
     list(GET config_groups 1 warp_dims)      # e.g., 4x1x1
     list(GET config_groups 2 warp_tile_dims) # e.g., 16x16x16
-    
+
     # Parse tile dimensions
     string(REPLACE "x" ";" tile_parts ${tile_dims})
     list(GET tile_parts 0 tile_m)
     list(GET tile_parts 1 tile_n)
     list(GET tile_parts 2 tile_k)
-    
+
     # Parse warp dimensions
     string(REPLACE "x" ";" warp_parts ${warp_dims})
     list(GET warp_parts 0 warp_m)
     list(GET warp_parts 1 warp_n)
     list(GET warp_parts 2 warp_k)
-    
+
     # Parse warp tile dimensions
     string(REPLACE "x" ";" warp_tile_parts ${warp_tile_dims})
     list(GET warp_tile_parts 0 warp_tile_m)
     list(GET warp_tile_parts 1 warp_tile_n)
     list(GET warp_tile_parts 2 warp_tile_k)
-    
+
     set(target_name "benchmark_gemm_${datatype}_${layout}_${trait}_${tile_config}")
     set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
-    
+
     # Generate the single instance header for this kernel
     set(instance_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
-    
+
     # Add custom command to generate the header file at build time
     add_custom_command(
         OUTPUT ${instance_header}
@@ -60,27 +60,27 @@ function(create_individual_gemm_target datatype layout trait tile_config config_
         DEPENDS ${GEMM_SOURCE_DIR}/gemm_instance_builder.py ${config_json}
         COMMENT "Generating ${instance_header}"
     )
-    
+
     # Create the executable
-    add_executable(${target_name} 
+    add_executable(${target_name}
         ${GEMM_SOURCE_DIR}/benchmark_gemm_single.cpp
         ${instance_header}
     )
-    
+
     # Set GPU architectures
     set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS_INDIVIDUAL})
-    
+
     # Set compile definitions
     target_compile_definitions(${target_name} PRIVATE
         GEMM_SINGLE_INSTANCE_HPP="${instance_header}"
     )
-    
+
     # Include directories
     target_include_directories(${target_name} PRIVATE
         ${GEMM_SOURCE_DIR}
         ${working_path}
     )
-    
+
     # Compile options
     target_compile_options(${target_name} PRIVATE
         -Wno-undefined-func-template
@@ -88,19 +88,19 @@ function(create_individual_gemm_target datatype layout trait tile_config config_
         --offload-compress
         -include ${instance_header}
     )
-    
+
     # Add to collection targets
     add_dependencies(benchmark_gemm_all ${target_name})
     add_dependencies(benchmark_gemm_${datatype} ${target_name})
     add_dependencies(benchmark_gemm_${layout} ${target_name})
     add_dependencies(benchmark_gemm_${datatype}_${layout} ${target_name})
-    
+
     # Add to trait-specific targets
     string(REPLACE "_" ";" trait_parts ${trait})
     list(GET trait_parts 0 pipeline)
     list(GET trait_parts 1 epilogue)
     list(GET trait_parts 2 scheduler)
-    
+
     add_dependencies(benchmark_gemm_${pipeline} ${target_name})
     add_dependencies(benchmark_gemm_${epilogue} ${target_name})
     add_dependencies(benchmark_gemm_${scheduler} ${target_name})
@@ -109,13 +109,13 @@ endfunction()
 # Function to build individual GEMM targets
 function(build_individual_gemm_targets datatype layout)
     set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
-    
+
     # Choose config file
     # Priority order:
     # 1. Environment variable GEMM_CONFIG_FILE
-    # 2. CMake variable GEMM_CONFIG_FILE  
+    # 2. CMake variable GEMM_CONFIG_FILE
     # 3. Default based on layout
-    
+
     # Check environment variable first
     if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "")
         set(config_filename "$ENV{GEMM_CONFIG_FILE}")
@@ -130,12 +130,12 @@ function(build_individual_gemm_targets datatype layout)
         set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
         message(STATUS "  Using default config for layout ${layout}")
     endif()
-    
+
     # Check if config file exists
     if(NOT EXISTS ${json_blob})
         message(FATAL_ERROR "Config file not found: ${json_blob}")
     endif()
-    
+
     # Determine number of workers for parallel generation
     if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
         set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
@@ -147,17 +147,24 @@ function(build_individual_gemm_targets datatype layout)
             set(num_workers 8)
         endif()
     endif()
-    
+
     # Generate individual kernel files using parallel version
     message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
     message(STATUS "  Working path: ${working_path}")
     message(STATUS "  Config file: ${json_blob}")
     message(STATUS "  Python executable: ${Python3_EXECUTABLE}")
     message(STATUS "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
-    
+
     # Create working directory first
     file(MAKE_DIRECTORY ${working_path})
-    
+
+    message(STATUS "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --config_json ${json_blob}
+                --list_kernels")
+
     # First, just list the kernels (fast operation)
     message(STATUS "  Listing kernel configurations...")
     execute_process(
@@ -172,11 +179,11 @@ function(build_individual_gemm_targets datatype layout)
         OUTPUT_VARIABLE list_output
         ERROR_VARIABLE list_error
     )
-    
+
     if(NOT ret EQUAL 0)
         message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${list_error}")
     endif()
-    
+
     # Read kernel count
     if(EXISTS ${working_path}/gemm_kernel_count.txt)
         file(READ ${working_path}/gemm_kernel_count.txt kernel_count)
@@ -185,7 +192,7 @@ function(build_individual_gemm_targets datatype layout)
     else()
         message(FATAL_ERROR "Kernel count file not found")
     endif()
-    
+
     # Read kernel list and create targets
     if(EXISTS ${working_path}/gemm_kernel_list.txt)
         file(STRINGS ${working_path}/gemm_kernel_list.txt kernel_lines)
@@ -195,7 +202,7 @@ function(build_individual_gemm_targets datatype layout)
             list(GET parts 0 kernel_name)
             list(GET parts 1 tile_config)
             list(GET parts 2 trait_combo)
-            
+
             # Create individual target
             create_individual_gemm_target("${datatype}" "${layout}" "${trait_combo}" "${tile_config}" "${json_blob}")
         endforeach()
@@ -210,9 +217,9 @@ message(STATUS "GEMM_DATATYPE: ${GEMM_DATATYPE}")
 message(STATUS "GEMM_LAYOUT: ${GEMM_LAYOUT}")
 message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
 
-# Filter GPU targets to only gfx90a, gfx942, and gfx950
+# Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201
 set(GEMM_GPU_TARGETS_INDIVIDUAL "")
-set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
+set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
 
 foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
     if(target IN_LIST DESIRED_TARGETS)
@@ -223,13 +230,13 @@ endforeach()
 
 # Skip build if no matching targets found
 if(NOT GEMM_GPU_TARGETS_INDIVIDUAL)
-    message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+    message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
 else()
     message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
 
     # Enable parallel compilation optimizations
     # Set up job pools for better parallel compilation control
-    set_property(GLOBAL PROPERTY JOB_POOLS 
+    set_property(GLOBAL PROPERTY JOB_POOLS
         compile_heavy=4    # Limit heavy compilations to prevent OOM
         compile_normal=16  # Allow more parallel normal compilations
     )
diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py
index 6a87193043..98595933b8 100644
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -179,6 +179,11 @@ warp_tile_supported_combinations = {
             [32, 32, 64],
         ],
     },
+    "gfx1201": {
+        "fp16_fp16_fp16": [
+            [16, 16, 16],
+        ],
+    },
 }
 
 # To Do: remove some unsupported combinations
diff --git a/tile_engine/ops/gemm/configs/gfx120x_config.json b/tile_engine/ops/gemm/configs/gfx120x_config.json
new file mode 100644
index 0000000000..6c4a5d0ec0
--- /dev/null
+++ b/tile_engine/ops/gemm/configs/gfx120x_config.json
@@ -0,0 +1,102 @@
+{
+    "problem": {
+    },
+    "tile_config": {
+        "tile_m": {
+            "values": [
+                256,
+                128,
+                64
+            ]
+        },
+        "tile_n": {
+            "values": [
+                256,
+                128,
+                64
+            ]
+        },
+        "tile_k": {
+            "values": [
+                256,
+                128,
+                64
+            ]
+        },
+        "warp_m": {
+            "values": [
+                4,
+                2,
+                1
+            ]
+        },
+        "warp_n": {
+            "values": [
+                4,
+                2,
+                1
+            ]
+        },
+        "warp_k": {
+            "values": [
+                1
+            ]
+        },
+        "warp_tile_m": {
+            "values": [
+                16
+            ]
+        },
+        "warp_tile_n": {
+            "values": [
+                16
+            ]
+        },
+        "warp_tile_k": {
+            "values": [
+                16
+            ]
+        }
+    },
+    "trait_config": {
+        "pipeline": {
+            "values": [
+                "compv3",
+                "mem"
+            ]
+        },
+        "scheduler": {
+            "values": [
+                "intrawave",
+                "interwave"
+            ]
+        },
+        "epilogue": {
+            "values": [
+                "cshuffle",
+                "default"
+            ]
+        },
+        "pad_m": {
+            "values": [
+                false
+            ]
+        },
+        "pad_n": {
+            "values": [
+                false
+            ]
+        },
+        "pad_k": {
+            "values": [
+                false
+            ]
+        },
+        "persistent": {
+            "values": [
+                false,
+                true
+            ]
+        }
+    }
+}
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
index d679be7b84..c2214da613 100644
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -8,6 +8,7 @@ import multiprocessing
 import concurrent.futures
 from pathlib import Path
 import logging
+from typing import Optional
 from validation_utils import is_tile_config_valid, is_trait_combination_valid
 
 logging.basicConfig(level=logging.INFO)
@@ -325,7 +326,7 @@ class GemmKernelBuilder:
         "c": "ck_tile::tensor_layout::gemm::ColumnMajor",
     }
 
-    def _get_abc_layouts(self, layout_code: str | None = None):
+    def _get_abc_layouts(self, layout_code: Optional[str] = None):
         """
         Return (ALayout, BLayout, CLayout) from a 3-letter code like 'rcr', 'ccr', 'crr', 'rrr'.
         If layout_code is None, use self.layout.
diff --git a/tile_engine/ops/gemm/validation_utils.py b/tile_engine/ops/gemm/validation_utils.py
index 4948fd5744..c0e109bf11 100644
--- a/tile_engine/ops/gemm/validation_utils.py
+++ b/tile_engine/ops/gemm/validation_utils.py
@@ -11,6 +11,7 @@ import subprocess
 import re
 from functools import lru_cache
 import logging
+from typing import Tuple, List
 
 # Element size mapping for different data types
 ELEMENT_SIZE_MAP = {
@@ -102,6 +103,36 @@ WARP_TILE_SUPPORTED_COMBINATIONS = {
             [32, 32, 64],
         ],
     },
+    "gfx1201": {
+        "fp16_fp16_fp16": [
+            [16, 16, 16],
+        ],
+    },    
+}
+
+# Supported warp tile combinations for different GPU architectures and data types
+WARP_SUPPORTED_COMBINATIONS = {
+    "gfx90a": [
+        [1, 4, 1], 
+        [2, 2, 1], 
+        [4, 1, 1],
+    ],
+    "gfx942": [
+        [1, 4, 1], 
+        [2, 2, 1], 
+        [4, 1, 1],
+    ],
+    "gfx950": [
+        [1, 4, 1], 
+        [2, 2, 1], 
+        [4, 1, 1],
+    ],
+    "gfx1201": [
+        [2, 4, 1], 
+        [1, 8, 1], 
+        [8, 1, 1], 
+        [4, 2, 1],
+    ],    
 }
 
 # Unsupported trait combinations
@@ -154,9 +185,32 @@ def is_trait_combination_valid(pipeline: str, epilogue: str, scheduler: str) ->
     return (pipeline, epilogue, scheduler) not in TRAIT_UNSUPPORTED_COMBINATIONS
 
 
-def validate_warp_configuration(warp_m: int, warp_n: int, warp_k: int) -> bool:
+def validate_warp_configuration(
+    warp_m: int, 
+    warp_n: int, 
+    warp_k: int,
+    gpu_name: str = None,
+) -> bool:
     """Validate warp configuration."""
-    return (warp_m, warp_n, warp_k) in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]
+    if gpu_name is None:
+        gpu_name = get_gpu_name_by_id(0)    
+
+    current_combination = [warp_m, warp_n, warp_k]
+
+    allowed_combinations = WARP_SUPPORTED_COMBINATIONS.get(gpu_name, {})
+    if not allowed_combinations:
+        # If GPU not recognized, try to be permissive but log warning
+        logging.warning(f"No warp_[m/n/k] combinations found for GPU: {gpu_name}")
+        return True
+
+    # Check if current combination is in the allowed list
+    if current_combination not in allowed_combinations:
+        error_msg = (
+            f"Invalid warp tile combination: {current_combination} not in allowed list. "
+        )
+        return False
+                
+    return True
 
 
 def validate_dimension_alignment(
@@ -169,7 +223,7 @@ def validate_dimension_alignment(
     warp_tile_m: int,
     warp_tile_n: int,
     warp_tile_k: int,
-) -> tuple[bool, list[str]]:
+) -> Tuple[bool, List[str]]:
     """Check if tile dimensions are properly aligned with warp dimensions."""
     alignment_issues = []
 
@@ -196,7 +250,7 @@ def validate_lds_capacity(
     a_datatype: str,
     b_datatype: str,
     pipeline: str,
-) -> tuple[bool, str]:
+) -> Tuple[bool, str]:
     """Validate LDS capacity requirements."""
     matrix_a_size = (tile_m * tile_k) * element_size(a_datatype)
     matrix_b_size = (tile_n * tile_k) * element_size(b_datatype)
@@ -224,7 +278,7 @@ def validate_warp_tile_combination(
     b_datatype: str,
     c_datatype: str,
     gpu_name: str = None,
-) -> tuple[bool, str]:
+) -> Tuple[bool, str]:
     """Validate warp tile combination against GPU-specific supported combinations."""
     if gpu_name is None:
         gpu_name = get_gpu_name_by_id(0)
diff --git a/tile_engine/ops/gemm_preshuffle/CMakeLists.txt b/tile_engine/ops/gemm_preshuffle/CMakeLists.txt
new file mode 100644
index 0000000000..2b8f5914f5
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/CMakeLists.txt
@@ -0,0 +1,296 @@
+set(GEMM_PRESHUFFLE_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM Preshuffle (semicolon-separated)")
+set(GEMM_PRESHUFFLE_LAYOUT "rcr" CACHE STRING "List of layout for GEMM Preshuffle (semicolon-separated)")
+set(GEMM_PRESHUFFLE_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
+option(ENABLE_CCACHE_GEMM_PRESHUFFLE "Enable ccache for GEMM Preshuffle ops compilation" OFF)
+
+# Store the directory path for use in functions
+set(GEMM_PRESHUFFLE_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR})
+
+# Function to create individual GEMM Preshuffle targets
+function(create_individual_gemm_preshuffle_target datatype layout trait tile_config config_json)
+    # Use the parent scope GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL variable
+    if(NOT GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL)
+        message(WARNING "Skipping individual GEMM Preshuffle target ${datatype}_${layout}_${trait}_${tile_config}: No supported GPU targets")
+        return()
+    endif()
+    
+    # Parse tile configuration: format is tile_mxtile_nxtile_k_warp_mxwarp_nxwarp_k_warp_tile_mxwarp_tile_nxwarp_tile_k
+    # First split by underscore to get three groups
+    string(REPLACE "_" ";" config_groups ${tile_config})
+    list(GET config_groups 0 tile_dims)      # e.g., 256x256x32
+    list(GET config_groups 1 warp_dims)      # e.g., 4x1x1
+    list(GET config_groups 2 warp_tile_dims) # e.g., 16x16x16
+    
+    # Parse tile dimensions
+    string(REPLACE "x" ";" tile_parts ${tile_dims})
+    list(GET tile_parts 0 tile_m)
+    list(GET tile_parts 1 tile_n)
+    list(GET tile_parts 2 tile_k)
+    
+    # Parse warp dimensions
+    string(REPLACE "x" ";" warp_parts ${warp_dims})
+    list(GET warp_parts 0 warp_m)
+    list(GET warp_parts 1 warp_n)
+    list(GET warp_parts 2 warp_k)
+    
+    # Parse warp tile dimensions
+    string(REPLACE "x" ";" warp_tile_parts ${warp_tile_dims})
+    list(GET warp_tile_parts 0 warp_tile_m)
+    list(GET warp_tile_parts 1 warp_tile_n)
+    list(GET warp_tile_parts 2 warp_tile_k)
+    
+    set(target_name "benchmark_gemm_preshuffle_${datatype}_${layout}_${trait}_${tile_config}")
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
+    
+    # Generate the single instance header for this kernel
+    set(instance_header "${working_path}/gemm_preshuffle_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
+    
+    # Add custom command to generate the header file at build time
+    add_custom_command(
+        OUTPUT ${instance_header}
+        COMMAND ${Python3_EXECUTABLE} ${GEMM_PRESHUFFLE_SOURCE_DIR}/gemm_preshuffle_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --config_json ${config_json}
+                --gen_single
+                --kernel_name "gemm_preshuffle_${datatype}_${layout}_${trait}_${tile_config}"
+                --tile_config "${tile_config}"
+                --trait_combo "${trait}"
+        DEPENDS ${GEMM_PRESHUFFLE_SOURCE_DIR}/gemm_preshuffle_instance_builder.py ${config_json}
+        COMMENT "Generating ${instance_header}"
+    )
+    
+    # Create the executable
+    add_executable(${target_name} 
+        ${GEMM_PRESHUFFLE_SOURCE_DIR}/benchmark_gemm_preshuffle_single.cpp
+        ${instance_header}
+    )
+    
+    # Set GPU architectures
+    set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL})
+    
+    # Set compile definitions
+    target_compile_definitions(${target_name} PRIVATE
+        GEMM_PRESHUFFLE_SINGLE_INSTANCE_HPP="${instance_header}"
+    )
+    
+    # Include directories
+    target_include_directories(${target_name} PRIVATE
+        ${GEMM_PRESHUFFLE_SOURCE_DIR}
+        ${working_path}
+    )
+    
+    # Compile options
+    target_compile_options(${target_name} PRIVATE
+        -Wno-undefined-func-template
+        -Wno-float-equal
+        --offload-compress
+        -include ${instance_header}
+    )
+    
+    # Add to collection targets
+    add_dependencies(benchmark_gemm_preshuffle_all ${target_name})
+    add_dependencies(benchmark_gemm_preshuffle_${datatype} ${target_name})
+    add_dependencies(benchmark_gemm_preshuffle_${layout} ${target_name})
+    add_dependencies(benchmark_gemm_preshuffle_${datatype}_${layout} ${target_name})
+    
+    # Add to trait-specific targets
+    string(REPLACE "_" ";" trait_parts ${trait})
+    list(GET trait_parts 0 pipeline)
+    list(GET trait_parts 1 epilogue)
+    list(GET trait_parts 2 scheduler)
+    
+    add_dependencies(benchmark_gemm_preshuffle_${pipeline}_pipeline ${target_name})
+    add_dependencies(benchmark_gemm_preshuffle_${epilogue}_epilogue ${target_name})
+    add_dependencies(benchmark_gemm_preshuffle_${scheduler}_scheduler ${target_name})
+endfunction()
+
+# Function to build individual GEMM Preshuffle targets
+function(build_individual_gemm_preshuffle_targets datatype layout)
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
+    
+    # Choose config file
+    # Priority order:
+    # 1. Environment variable GEMM_PRESHUFFLE_CONFIG_FILE
+    # 2. CMake variable GEMM_PRESHUFFLE_CONFIG_FILE  
+    # 3. Default based on layout
+    
+    # Check environment variable first
+    if(DEFINED ENV{GEMM_PRESHUFFLE_CONFIG_FILE} AND NOT "$ENV{GEMM_PRESHUFFLE_CONFIG_FILE}" STREQUAL "")
+        set(config_filename "$ENV{GEMM_PRESHUFFLE_CONFIG_FILE}")
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
+        message(STATUS "  Using config from environment variable: ${config_filename}")
+    elseif(NOT "${GEMM_PRESHUFFLE_CONFIG_FILE}" STREQUAL "")
+        # Use CMake variable if set
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_PRESHUFFLE_CONFIG_FILE}")
+        message(STATUS "  Using custom config: ${GEMM_PRESHUFFLE_CONFIG_FILE}")
+    else()
+        # Use default config for all layouts
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
+        message(STATUS "  Using default config for layout ${layout}")
+    endif()
+    
+    # Check if config file exists
+    if(NOT EXISTS ${json_blob})
+        message(FATAL_ERROR "Config file not found: ${json_blob}")
+    endif()
+    
+    # Determine number of workers for parallel generation
+    if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+        set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+    else()
+        # Use processor count but limit to avoid memory issues
+        cmake_host_system_information(RESULT num_cores QUERY NUMBER_OF_LOGICAL_CORES)
+        math(EXPR num_workers "${num_cores}")
+        if(num_workers GREATER 8)
+            set(num_workers 8)
+        endif()
+    endif()
+    
+    # Generate individual kernel files using parallel version
+    message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
+    message(STATUS "  Working path: ${working_path}")
+    message(STATUS "  Config file: ${json_blob}")
+    message(STATUS "  Python executable: ${Python3_EXECUTABLE}")
+    message(STATUS "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_preshuffle_instance_builder.py")
+    
+    # Create working directory first
+    file(MAKE_DIRECTORY ${working_path})
+    
+    # First, just list the kernels (fast operation)
+    message(STATUS "  Listing kernel configurations...")
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_preshuffle_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --config_json ${json_blob}
+                --list_kernels
+        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        RESULT_VARIABLE ret
+        OUTPUT_VARIABLE list_output
+        ERROR_VARIABLE list_error
+    )
+    
+    if(NOT ret EQUAL 0)
+        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${list_error}")
+    endif()
+    
+    # Read kernel count
+    if(EXISTS ${working_path}/gemm_preshuffle_kernel_count.txt)
+        file(READ ${working_path}/gemm_preshuffle_kernel_count.txt kernel_count)
+        string(STRIP "${kernel_count}" kernel_count)
+        message(STATUS "  Found ${kernel_count} kernel configurations")
+    else()
+        message(FATAL_ERROR "Kernel count file not found")
+    endif()
+    
+    # Read kernel list and create targets
+    if(EXISTS ${working_path}/gemm_preshuffle_kernel_list.txt)
+        file(STRINGS ${working_path}/gemm_preshuffle_kernel_list.txt kernel_lines)
+        foreach(line IN LISTS kernel_lines)
+            # Parse line: kernel_name|tile_config|trait_combo
+            string(REPLACE "|" ";" parts "${line}")
+            list(GET parts 0 kernel_name)
+            list(GET parts 1 tile_config)
+            list(GET parts 2 trait_combo)
+            
+            # Create individual target
+            create_individual_gemm_preshuffle_target("${datatype}" "${layout}" "${trait_combo}" "${tile_config}" "${json_blob}")
+        endforeach()
+    else()
+        message(FATAL_ERROR "Kernel list file not found")
+    endif()
+endfunction()
+
+# Main build logic - Only individual builds supported
+message(STATUS "=== Starting Tile Engine GEMM Preshuffle Configuration ===")
+message(STATUS "GEMM_PRESHUFFLE_DATATYPE: ${GEMM_PRESHUFFLE_DATATYPE}")
+message(STATUS "GEMM_PRESHUFFLE_LAYOUT: ${GEMM_PRESHUFFLE_LAYOUT}")
+message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+
+# Filter GPU targets to only gfx90a, gfx942, and gfx950
+set(GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL "")
+set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
+
+foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
+    if(target IN_LIST DESIRED_TARGETS)
+        list(APPEND GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL ${target})
+        message(STATUS "  Adding GPU target: ${target}")
+    endif()
+endforeach()
+
+# Skip build if no matching targets found
+if(NOT GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL)
+    message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+else()
+    message(STATUS "Building individual GEMM Preshuffle targets for GPU targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}")
+
+    # Enable parallel compilation optimizations
+    # Set up job pools for better parallel compilation control
+    set_property(GLOBAL PROPERTY JOB_POOLS 
+        compile_heavy=4    # Limit heavy compilations to prevent OOM
+        compile_normal=16  # Allow more parallel normal compilations
+    )
+
+    # Enable compiler cache if available and explicitly requested
+    # Disabled by default due to permission issues in CI environments
+    if(ENABLE_CCACHE_GEMM_PRESHUFFLE)
+        find_program(CCACHE_PROGRAM ccache)
+        if(CCACHE_PROGRAM)
+            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
+            message(STATUS "Using ccache for faster compilation")
+        else()
+            message(WARNING "ccache requested but not found")
+        endif()
+    else()
+        message(STATUS "ccache disabled for GEMM Preshuffle ops (use -DENABLE_CCACHE_GEMM_PRESHUFFLE=ON to enable)")
+    endif()
+
+    # Create master collection targets
+    add_custom_target(benchmark_gemm_preshuffle_all)
+
+    # Create datatype collection targets
+    foreach(dt IN LISTS GEMM_PRESHUFFLE_DATATYPE)
+        add_custom_target(benchmark_gemm_preshuffle_${dt})
+    endforeach()
+
+    # Create layout collection targets
+    foreach(l IN LISTS GEMM_PRESHUFFLE_LAYOUT)
+        add_custom_target(benchmark_gemm_preshuffle_${l})
+    endforeach()
+
+    # Create combined collection targets
+    foreach(dt IN LISTS GEMM_PRESHUFFLE_DATATYPE)
+        foreach(l IN LISTS GEMM_PRESHUFFLE_LAYOUT)
+            add_custom_target(benchmark_gemm_preshuffle_${dt}_${l})
+        endforeach()
+    endforeach()
+
+    # Create trait-based collection targets
+    # These are common trait components used across all GEMM kernels
+    set(GEMM_PRESHUFFLE_PIPELINES "preshufflev1;preshufflev2")
+    set(GEMM_PRESHUFFLE_EPILOGUES "default;cshuffle")
+    set(GEMM_PRESHUFFLE_SCHEDULERS "intrawave;interwave;default") 
+
+    foreach(pipeline IN LISTS GEMM_PRESHUFFLE_PIPELINES)
+        add_custom_target(benchmark_gemm_preshuffle_${pipeline}_pipeline)
+    endforeach()
+
+    foreach(epilogue IN LISTS GEMM_PRESHUFFLE_EPILOGUES)
+        add_custom_target(benchmark_gemm_preshuffle_${epilogue}_epilogue)
+    endforeach()
+
+    foreach(scheduler IN LISTS GEMM_PRESHUFFLE_SCHEDULERS)
+        add_custom_target(benchmark_gemm_preshuffle_${scheduler}_scheduler)
+    endforeach()
+
+    # Build individual targets for each datatype/layout combination
+    
+    foreach(dt IN LISTS GEMM_PRESHUFFLE_DATATYPE)
+        foreach(l IN LISTS GEMM_PRESHUFFLE_LAYOUT)
+            build_individual_gemm_preshuffle_targets(${dt} ${l})
+        endforeach()
+    endforeach()
+endif()
diff --git a/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle.hpp b/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle.hpp
new file mode 100644
index 0000000000..74fccf6bf2
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle.hpp
@@ -0,0 +1,225 @@
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_preshuffle_common.hpp"
+
+//[TODO] Move parts of this File to commons
+enum class Metric
+{
+    LATENCY   = 0,
+    TFLOPS    = 1,
+    BANDWIDTH = 2
+};
+
+inline constexpr auto get_metric_name(Metric m)
+{
+    switch(m)
+    {
+    case Metric::LATENCY: return "latency";
+    case Metric::TFLOPS: return "tflops";
+    case Metric::BANDWIDTH: return "bandwidth";
+    default: throw std::invalid_argument("Unsupported metric type");
+    }
+}
+
+struct GemmProblem
+{
+    int split_k_;
+    int m_, n_, k_;
+    int stride_a_, stride_b_, stride_c_;
+
+    std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_;
+    std::string layout_a_, layout_b_, layout_c_;
+
+    bool structured_sparsity_;
+
+    friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem)
+    {
+        os << "{\n"
+           << "   \"split_k\":" << problem.split_k_ << ",\n"
+           << "   \"m\":" << problem.m_ << ",\n"
+           << "   \"n\":" << problem.n_ << ",\n"
+           << "   \"k\":" << problem.k_ << ",\n"
+           << "   \"stride_a\":" << problem.stride_a_ << ",\n"
+           << "   \"stride_b\":" << problem.stride_b_ << ",\n"
+           << "   \"stride_c\":" << problem.stride_c_ << ",\n"
+           << "   \"dtype_a\":\"" << problem.dtype_a_ << "\",\n"
+           << "   \"dtype_b\":\"" << problem.dtype_b_ << "\",\n"
+           << "   \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n"
+           << "   \"dtype_c\":\"" << problem.dtype_c_ << "\",\n"
+           << "   \"layout_a\":\"" << problem.layout_a_ << "\",\n"
+           << "   \"layout_b\":\"" << problem.layout_b_ << "\",\n"
+           << "   \"layout_c\":\"" << problem.layout_c_ << "\",\n"
+           << "   \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false")
+           << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct PerformanceResult
+{
+    double latency_;
+    double tflops_;
+    double bandwidth_;
+
+    static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m)
+    {
+        switch(m)
+        {
+        case Metric::LATENCY: return a.latency_ < b.latency_;
+        case Metric::TFLOPS: return a.tflops_ > b.tflops_;
+        case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_;
+        default: throw std::invalid_argument("Unsupported metric type");
+        }
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result)
+    {
+        os << "{\n"
+           << "   \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_
+           << ",\n"
+           << "   \"tflops(TFlops)\": " << result.tflops_ << ",\n"
+           << "   \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct KernelInstance
+{
+    std::string name_;
+    GemmProblem problem_;
+    PerformanceResult perf_result_;
+
+    static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m)
+    {
+        return PerformanceResult::compare(a.perf_result_, b.perf_result_, m);
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj)
+    {
+        os << "{\n"
+           << " \"name\": \"" << obj.name_ << "\",\n"
+           << " \"problem\": " << obj.problem_ << ",\n"
+           << " \"perf_result\": " << obj.perf_result_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct Setting
+{
+    int n_warmup_;
+    int n_repeat_;
+    bool is_gpu_timer_;
+    int verify_;
+    int init_method_;
+    bool log_;
+    std::string csv_filename_;
+    bool flush_cache_;
+    int rotating_count_;
+    bool json_output_;
+};
+
+inline std::string get_rocm_version()
+{
+    std::ifstream version_file("/opt/rocm/.info/version");
+    if(version_file.is_open())
+    {
+        std::string version;
+        std::getline(version_file, version);
+        return version;
+    }
+    return "Unknown";
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+/// @brief Function to compare the results of the device and host computations
+bool compare(std::string instanceName,
+             ck_tile::index_t K,
+             ck_tile::index_t kbatch,
+             ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+             ck_tile::HostTensor<CDataType>& c_m_n_ref)
+{
+    const float max_accumulated_value =
+        *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+    const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+        K, kbatch, max_accumulated_value);
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_ref,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+    std::cout << "For " << instanceName << " Relative error threshold is "
+              << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is "
+              << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl;
+
+    return pass;
+}
+
+/// @brief Function to get the kernel output with reference implementation on CPU/GPU
+void gemm_host_reference(int verify,
+                         ck_tile::HostTensor<ADataType>& a_m_k,
+                         ck_tile::HostTensor<BDataType>& b_k_n,
+                         ck_tile::HostTensor<CDataType>& c_m_n_ref,
+                         ck_tile::DeviceMem& a_m_k_dev_buf,
+                         ck_tile::DeviceMem& b_k_n_dev_buf,
+                         ck_tile::index_t M,
+                         ck_tile::index_t N,
+                         ck_tile::index_t K,
+                         ck_tile::index_t stride_A,
+                         ck_tile::index_t stride_B,
+                         ck_tile::index_t stride_C)
+{
+    if(verify == 1)
+    {
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_ref);
+    }
+    else if(verify == 2)
+    {
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());
+    }
+}
diff --git a/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle_single.cpp b/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle_single.cpp
new file mode 100644
index 0000000000..152e27e77e
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/benchmark_gemm_preshuffle_single.cpp
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <functional>
+#include <tuple>
+#include <exception>
+#include <sstream>
+#include <vector>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_preshuffle_profiler.hpp"
+#include "gemm_preshuffle_common.hpp"
+
+// The kernel header is included via the compile command line with -include flag
+// It defines SelectedKernel struct and KERNEL_NAME
+// DataTypeTraits are now defined in gemm_common.hpp
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 0, no validation.")
+        .insert("log",
+                "false",
+                "Whether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
+        .insert(
+            "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether if the timer is gpu timer or not. Possible values are false or true. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "To flush cache, possible values are true or false. "
+                "Default is false.")
+        .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether use sparsity kernel or not. Possible values are true or false. Default is "
+                "false")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. "
+                "Default is "
+                "false");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+void benchmark_gemm_preshuffle_single(const ck_tile::ArgParser& arg_parser)
+{
+    // Use DataTypeTraits to get the actual type names from the generated header
+    // The generated header defines ADataType, BDataType, AccDataType, CDataType
+    std::string dtype_a   = DataTypeTraits<ADataType>::name;
+    std::string dtype_b   = DataTypeTraits<BDataType>::name;
+    std::string dtype_acc = DataTypeTraits<AccDataType>::name;
+    std::string dtype_c   = DataTypeTraits<CDataType>::name;
+
+    // Layout names from the layout types
+    std::string layout_a = ALayout::name;
+    std::string layout_b = BLayout::name;
+    std::string layout_c = CLayout::name;
+
+    // Create GemmProblem struct
+    GemmProblem gemm_problem{arg_parser.get_int("split_k"),
+                             arg_parser.get_int("m"),
+                             arg_parser.get_int("n"),
+                             arg_parser.get_int("k"),
+                             arg_parser.get_int("stride_a"),
+                             arg_parser.get_int("stride_b"),
+                             arg_parser.get_int("stride_c"),
+                             dtype_a,
+                             dtype_b,
+                             dtype_acc,
+                             dtype_c,
+                             layout_a,
+                             layout_b,
+                             layout_c,
+                             arg_parser.get_bool("structured_sparsity")};
+
+    // Create Setting struct
+    Setting setting{arg_parser.get_int("warmup"),
+                    arg_parser.get_int("repeat"),
+                    arg_parser.get_bool("timer"),
+                    arg_parser.get_int("verify"),
+                    arg_parser.get_int("init"),
+                    arg_parser.get_bool("log"),
+                    arg_parser.get_str("csv_filename"),
+                    arg_parser.get_bool("flush_cache"),
+                    arg_parser.get_int("rotating_count"),
+                    arg_parser.get_bool("json_output")};
+
+    // Get the profiler instance
+    auto& profiler = GemmProfiler::instance(setting);
+
+    try
+    {
+        // Create a lambda that wraps the kernel launch
+
+        std::tuple<int, int, int> warp_tile_dims = std::make_tuple(
+            SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK);
+
+        auto kernel_func = [](const ck_tile::GemmHostArgs& args,
+                              const ck_tile::stream_config& stream) {
+            return SelectedKernel::launch(args, stream);
+        };
+
+        // Benchmark the kernel
+        profiler.benchmark(gemm_problem, kernel_func, warp_tile_dims);
+
+        // Select best instance based on metric
+        profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Benchmark failed: " << e.what() << std::endl;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, parser] = create_args(argc, argv);
+        if(!result)
+            return EXIT_FAILURE;
+
+        benchmark_gemm_preshuffle_single(parser);
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py b/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py
new file mode 100644
index 0000000000..2bc42f1ce7
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+"""
+Validation utilities for GEMM kernel generation.
+Extracted from tile_engine_develop for consistency.
+"""
+
+import subprocess
+import re
+from functools import lru_cache
+import logging
+from typing import Tuple, List
+
+# Element size mapping for different data types
+ELEMENT_SIZE_MAP = {
+    "fp16": 2,
+    "bf16": 2,
+    "int8": 1,
+    "fp8": 1,
+    "bf8": 1,
+    "int4": 0.5,
+    "int32": 4,
+    "fp32": 4,
+    "fp64": 8,
+}
+
+# [TODO] Handle this while moving code to commons
+# Supported warp tile combinations for different GPU architectures and data types
+WARP_TILE_SUPPORTED_COMBINATIONS = {
+    "gfx90a": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32]],
+        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32]],
+    },
+    "gfx942": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
+        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
+        "int8_int8_int32": [[16, 16, 32], [32, 32, 16]],
+    },
+    "gfx950": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [
+            [32, 32, 16],
+            [32, 32, 32],
+            [16, 16, 32],
+            [16, 16, 64],
+            [16, 16, 128],
+            [32, 32, 64],
+        ],
+        "bf8_bf8_fp16": [
+            [32, 32, 16],
+            [32, 32, 32],
+            [16, 16, 64],
+            [16, 16, 32],
+            [16, 16, 128],
+            [32, 32, 64],
+        ],
+    },
+}
+
+# Unsupported trait combinations
+TRAIT_UNSUPPORTED_COMBINATIONS = {
+    ("compv3", "cshuffle", "interwave"),
+    ("compv3", "default", "interwave"),
+    ("compv4", "cshuffle", "interwave"),
+    ("compv4", "default", "interwave"),
+}
+
+
+def element_size(data_type: str) -> float:
+    """Calculate the size (in bytes) of a single element for given data type."""
+    data_type = data_type.lower()
+    if data_type not in ELEMENT_SIZE_MAP:
+        raise ValueError(f"Unsupported data type: {data_type}")
+    return ELEMENT_SIZE_MAP[data_type]
+
+
+GPU_NAME_PATTERN = re.compile(r"Name:\s*(gfx\d+\w*)")
+
+
+@lru_cache(maxsize=1)
+def get_gpu_name_by_id(gpu_id: int = 0) -> str:
+    """Retrieve GPU name (e.g. gfx90a) by device ID"""
+    try:
+        output = subprocess.check_output(
+            ["rocminfo"], text=True, stderr=subprocess.PIPE, timeout=5
+        )
+        if matches := GPU_NAME_PATTERN.finditer(output):
+            gpu_list = [m.group(1) for m in matches]
+            return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
+
+        return ""
+
+    except subprocess.CalledProcessError as e:
+        logging.debug(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
+    except FileNotFoundError:
+        logging.debug("ROCm tools not installed (requires rocminfo)")
+    except subprocess.TimeoutExpired:
+        logging.debug("GPU query timeout (5s)")
+    except Exception as e:
+        logging.debug(f"GPU detection error: {str(e)}")
+
+    return ""
+
+
+def is_trait_combination_valid(pipeline: str, epilogue: str, scheduler: str) -> bool:
+    """Check if a trait combination is valid."""
+    return (pipeline, epilogue, scheduler) not in TRAIT_UNSUPPORTED_COMBINATIONS
+
+
+def validate_warp_configuration(warp_m: int, warp_n: int, warp_k: int) -> bool:
+    """Validate warp configuration."""
+    return (warp_m, warp_n, warp_k) in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]
+
+
+def validate_dimension_alignment(
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    warp_m: int,
+    warp_n: int,
+    warp_k: int,
+    warp_tile_m: int,
+    warp_tile_n: int,
+    warp_tile_k: int,
+) -> Tuple[bool, List[str]]:
+    """Check if tile dimensions are properly aligned with warp dimensions."""
+    alignment_issues = []
+
+    if tile_m % (warp_m * warp_tile_m) != 0:
+        alignment_issues.append(
+            f"tile_m({tile_m}) % [{warp_m}x{warp_tile_m}] = {tile_m % (warp_m * warp_tile_m)}"
+        )
+    if tile_n % (warp_n * warp_tile_n) != 0:
+        alignment_issues.append(
+            f"tile_n({tile_n}) % [{warp_n}x{warp_tile_n}] = {tile_n % (warp_n * warp_tile_n)}"
+        )
+    if tile_k % (warp_k * warp_tile_k) != 0:
+        alignment_issues.append(
+            f"tile_k({tile_k}) % [{warp_k}x{warp_tile_k}] = {tile_k % (warp_k * warp_tile_k)}"
+        )
+
+    return len(alignment_issues) == 0, alignment_issues
+
+
+def validate_lds_capacity(
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_datatype: str,
+    b_datatype: str,
+    pipeline: str,
+) -> Tuple[bool, str]:
+    """Validate LDS capacity requirements."""
+    matrix_a_size = (tile_m * tile_k) * element_size(a_datatype)
+    matrix_b_size = (tile_n * tile_k) * element_size(b_datatype)
+    total_tile_in_lds = matrix_a_size + matrix_b_size
+
+    max_tile_size = 2**15 if pipeline == "compv4" else 2**16
+
+    if total_tile_in_lds > max_tile_size:
+        error_msg = (
+            f"LDS capacity exceeded: Total required {total_tile_in_lds:,}B ({total_tile_in_lds / 1024:.1f}KB) > "
+            f"maximum allowed {max_tile_size:,}B ({max_tile_size / 1024}KB). Breakdown:\n"
+            f"- Matrix A ({a_datatype}): {tile_m}x{tile_k} = {matrix_a_size:,}B\n"
+            f"- Matrix B ({b_datatype}): {tile_n}x{tile_k} = {matrix_b_size:,}B"
+        )
+        return False, error_msg
+
+    return True, ""
+
+
+def validate_warp_tile_combination(
+    warp_tile_m: int,
+    warp_tile_n: int,
+    warp_tile_k: int,
+    a_datatype: str,
+    b_datatype: str,
+    c_datatype: str,
+    gpu_name: str = None,
+) -> Tuple[bool, str]:
+    """Validate warp tile combination against GPU-specific supported combinations."""
+    if gpu_name is None:
+        gpu_name = get_gpu_name_by_id(0)
+
+    # Construct the key for looking up supported combinations
+    warp_tile_key = f"{a_datatype}_{b_datatype}_{c_datatype}"
+    current_combination = [warp_tile_m, warp_tile_n, warp_tile_k]
+
+    # Check if we have GPU-specific combinations
+    gpu_warp_tile_combinations = WARP_TILE_SUPPORTED_COMBINATIONS.get(gpu_name, {})
+    if not gpu_warp_tile_combinations:
+        # If GPU not recognized, try to be permissive but log warning
+        logging.warning(f"No warp tile combinations found for GPU: {gpu_name}")
+        return True, ""
+
+    # Check if we have combinations for this data type combination
+    allowed_combinations = gpu_warp_tile_combinations.get(warp_tile_key, [])
+    if not allowed_combinations:
+        # For data type combinations not in the list, be permissive
+        logging.debug(
+            f"No warp tile combinations found for data types: {warp_tile_key}"
+        )
+        return True, ""
+
+    # Check if current combination is in the allowed list
+    if current_combination not in allowed_combinations:
+        error_msg = (
+            f"Invalid warp tile combination: {current_combination} not in allowed list. "
+            f"Valid combinations for '{warp_tile_key}' on {gpu_name}: {allowed_combinations}"
+        )
+        return False, error_msg
+
+    return True, ""
+
+
+def is_tile_config_valid(
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    warp_m: int,
+    warp_n: int,
+    warp_k: int,
+    warp_tile_m: int,
+    warp_tile_n: int,
+    warp_tile_k: int,
+    a_datatype: str,
+    b_datatype: str,
+    c_datatype: str,
+    pipeline: str,
+    trait_name: str = None,
+) -> bool:
+    """
+    Comprehensive tile configuration validation.
+    Returns True if configuration is valid, False otherwise.
+    """
+    # Basic sanity checks
+    if tile_m <= 0 or tile_n <= 0 or tile_k <= 0:
+        return False
+    if warp_m <= 0 or warp_n <= 0 or warp_k <= 0:
+        return False
+    if warp_tile_m <= 0 or warp_tile_n <= 0 or warp_tile_k <= 0:
+        return False
+
+    # Check that warp tiles fit within block tiles
+    if warp_m * warp_tile_m > tile_m:
+        return False
+    if warp_n * warp_tile_n > tile_n:
+        return False
+    if warp_k * warp_tile_k > tile_k:
+        return False
+
+    # Validate warp configuration
+    if not validate_warp_configuration(warp_m, warp_n, warp_k):
+        logging.debug(
+            f"Invalid warp configuration: warp_m({warp_m}), warp_n({warp_n}), warp_k({warp_k})"
+        )
+        return False
+
+    # Validate dimension alignment
+    is_aligned, alignment_issues = validate_dimension_alignment(
+        tile_m,
+        tile_n,
+        tile_k,
+        warp_m,
+        warp_n,
+        warp_k,
+        warp_tile_m,
+        warp_tile_n,
+        warp_tile_k,
+    )
+    if not is_aligned:
+        logging.debug(
+            f"Dimension alignment failed: {', '.join(alignment_issues)}. "
+            f"Tile dimensions {tile_m}x{tile_n}x{tile_k} must be divisible by "
+            f"[warp]: {warp_m}x{warp_n}x{warp_k} x [warp_tile]: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}"
+        )
+        return False
+
+    # Validate LDS capacity
+    lds_valid, lds_error = validate_lds_capacity(
+        tile_m, tile_n, tile_k, a_datatype, b_datatype, pipeline
+    )
+    if not lds_valid:
+        logging.debug(f"LDS validation failed: {lds_error}")
+        return False
+
+    # Validate warp tile combination
+    warp_tile_valid, warp_tile_error = validate_warp_tile_combination(
+        warp_tile_m, warp_tile_n, warp_tile_k, a_datatype, b_datatype, c_datatype
+    )
+    if not warp_tile_valid:
+        logging.debug(f"Warp tile validation failed: {warp_tile_error}")
+        return False
+
+    return True
+
+
+# [TODO] Handle this while moving code to commons Add more datatype to this function if needed
+def get_dtype_string(datatype: str) -> str:
+    """Get C++ type string for datatype"""
+    dtype_map = {
+        "fp16": "ck_tile::fp16_t",
+        "fp8": "ck_tile::fp8_t",
+        "bf16": "ck_tile::bf16_t",
+        "fp32": "float",
+        "fp64": "double",
+    }
+    return dtype_map.get(datatype, "float")
+
+
+LAYOUT_MAP = {
+    "r": "ck_tile::tensor_layout::gemm::RowMajor",
+    "c": "ck_tile::tensor_layout::gemm::ColumnMajor",
+}
+
+
+def get_abc_layouts(layout_code: str) -> Tuple[str, str, str]:
+    """
+    Return (ALayout, BLayout, CLayout) from a 3-letter code like 'rcr', 'ccr', 'crr', 'rrr'.
+    """
+    code = str(layout_code).strip().lower()
+
+    a_layout = LAYOUT_MAP[code[0]]
+    b_layout = LAYOUT_MAP[code[1]]
+    c_layout = LAYOUT_MAP[code[2]]
+    return a_layout, b_layout, c_layout
diff --git a/tile_engine/ops/gemm_preshuffle/configs/default_config.json b/tile_engine/ops/gemm_preshuffle/configs/default_config.json
new file mode 100644
index 0000000000..d4c3537c65
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/configs/default_config.json
@@ -0,0 +1,91 @@
+{
+  "tile_config": {
+    "tile_m": {
+        "values": [
+            128
+        ]
+    },
+    "tile_n": {
+        "values": [
+            128
+        ]
+    },
+    "tile_k": {
+        "values": [
+            128
+        ]
+    },
+    "warp_m": {
+        "values": [
+            1
+        ]
+    },
+    "warp_n": {
+        "values": [
+            4
+        ]
+    },
+    "warp_k": {
+        "values": [
+            1
+        ]
+    },
+    "warp_tile_m": {
+        "values": [
+            16
+        ]
+    },
+    "warp_tile_n": {
+        "values": [
+            16
+        ]
+    },
+    "warp_tile_k": {
+        "values": [
+            16,32
+        ]
+    }
+  },
+  "trait_config": {
+    "pipeline": {
+      "values": [
+        "preshufflev1",
+        "preshufflev2"
+      ]
+    },
+    "scheduler": {
+      "values": [
+        "interwave",
+        "intrawave"
+      ]
+    },
+    "epilogue": {
+      "values": [
+        "default",
+        "cshuffle"
+      ]
+    },
+    "pad_m": {
+      "values": [
+        false
+      ]
+    },
+    "pad_n": {
+      "values": [
+        false
+      ]
+    },
+    "pad_k": {
+      "values": [
+        false
+      ]
+    },
+    "persistent": {
+        "values": [
+            true,
+            false
+        ]
+    }
+  },
+  "k_block_per_cu": 2
+}
\ No newline at end of file
diff --git a/tile_engine/ops/gemm_preshuffle/configs/user_provided_config.json b/tile_engine/ops/gemm_preshuffle/configs/user_provided_config.json
new file mode 100644
index 0000000000..c0fc1f6cf8
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/configs/user_provided_config.json
@@ -0,0 +1,87 @@
+{
+  "tile_config": {
+    "tile_m": {
+      "values": [
+        128
+      ]
+    },
+    "tile_n": {
+      "values": [
+        128
+      ]
+    },
+    "tile_k": {
+      "values": [
+        64
+      ]
+    },
+    "warp_m": {
+      "values": [
+        1
+      ]
+    },
+    "warp_n": {
+      "values": [
+        4
+      ]
+    },
+    "warp_k": {
+      "values": [
+        1
+      ]
+    },
+    "warp_tile_m": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_n": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_k": {
+      "values": [
+        16,32
+      ]
+    }
+  },
+  "trait_config": {
+    "pipeline": {
+      "values": [
+        "preshufflev2"
+      ]
+    },
+    "scheduler": {
+      "values": [
+        "intrawave"
+      ]
+    },
+    "epilogue": {
+      "values": [
+        "default"
+      ]
+    },
+    "pad_m": {
+      "values": [
+        false
+      ]
+    },
+    "pad_n": {
+      "values": [
+        false
+      ]
+    },
+    "pad_k": {
+      "values": [
+        false
+      ]
+    },
+    "persistent": {
+        "values": [
+            false
+       ]
+    }
+  },
+  "k_block_per_cu": 8
+}
\ No newline at end of file
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py
new file mode 100755
index 0000000000..0217a439f2
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py
@@ -0,0 +1,684 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+import sys
+import json
+import subprocess
+import argparse
+import csv
+import time
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+
+
+class GemmPreshuffleBenchmark:
+    def __init__(self, build_dir: str, verbose: bool = False):
+        self.build_dir = Path(build_dir)
+        self.verbose = verbose
+        self.results = []
+
+    def discover_kernels(self) -> List[Path]:
+        """Find all benchmark_gemm_preshuffle* executables in the build directory"""
+        bin_dir = self.build_dir / "bin"
+        if not bin_dir.exists():
+            print(f"Error: Binary directory {bin_dir} does not exist")
+            return []
+
+        kernels = list(bin_dir.glob("benchmark_gemm_preshuffle*"))
+        if self.verbose:
+            print(f"Found {len(kernels)} kernel executables")
+            for k in kernels:
+                print(f"  - {k.name}")
+        return kernels
+
+    def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]:
+        """Extract comprehensive kernel information from filename"""
+        name = kernel_path.stem
+
+        # Initialize with basic info
+        info = {
+            "executable": str(kernel_path),
+            "name": name,
+            "data_type": "unknown",
+            "layout": "unknown",
+            "pipeline": "unknown",
+            "scheduler": "unknown",
+            "epilogue": "unknown",
+        }
+
+        # Parse the kernel name pattern:
+        # benchmark_gemm_preshuffle_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16
+        parts = name.split("_")
+
+        if len(parts) >= 4:
+            # Extract data type (4rd part after benchmark_gemm_preshuffle_)
+            info["data_type"] = parts[3] if len(parts) > 2 else "unknown"
+
+            # Extract layout (5th part)
+            info["layout"] = parts[4] if len(parts) > 3 else "unknown"
+
+            # Extract pipeline (6th part)
+            info["pipeline"] = parts[5] if len(parts) > 4 else "unknown"
+
+            # Extract epilogue (7th part)
+            info["epilogue"] = parts[6] if len(parts) > 5 else "unknown"
+
+            # Extract scheduler (8th part)
+            info["scheduler"] = parts[7] if len(parts) > 6 else "unknown"
+
+        # Extract detailed configuration from the end of the name
+        config_info = self.parse_detailed_config(name)
+        info.update(config_info)
+
+        # Generate config ID
+        info["config_id"] = self.generate_config_id(info)
+
+        return info
+
+    def parse_detailed_config(self, kernel_name: str) -> Dict:
+        """Parse detailed configuration from kernel name"""
+        config = {
+            "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0},
+            "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0},
+            "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0},
+            "optimization_flags": {
+                "pad_m": False,
+                "pad_n": False,
+                "pad_k": False,
+                "persistent": False,
+            },
+        }
+
+        # Split by underscore and look for patterns
+        parts = kernel_name.split("_")
+
+        # Look for boolean flags (sequence of True/False values)
+        bool_sequence = []
+        for i, part in enumerate(parts):
+            if part in ["True", "False"]:
+                bool_sequence.append(part == "True")
+                # Continue collecting consecutive boolean values
+                j = i + 1
+                while j < len(parts) and parts[j] in ["True", "False"]:
+                    bool_sequence.append(parts[j] == "True")
+                    j += 1
+                break
+
+        # Assign boolean flags if we found them
+        # Order: pad_m, pad_n, pad_k, persistent (4 flags total)
+        if len(bool_sequence) >= 4:
+            config["optimization_flags"]["pad_m"] = bool_sequence[0]
+            config["optimization_flags"]["pad_n"] = bool_sequence[1]
+            config["optimization_flags"]["pad_k"] = bool_sequence[2]
+            config["optimization_flags"]["persistent"] = bool_sequence[3]
+
+        # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16)
+        # The pattern is: tile_sizes_warp_config_warp_tile
+        dimension_groups = []
+        for part in parts:
+            if "x" in part and len(part.split("x")) == 3:
+                try:
+                    dims = [int(x) for x in part.split("x")]
+                    if all(d > 0 for d in dims):
+                        dimension_groups.append(dims)
+                except ValueError:
+                    continue
+
+        # Assign dimensions based on order and magnitude
+        if len(dimension_groups) >= 3:
+            # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile
+            sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True)
+
+            # Largest dimensions = tile sizes
+            config["tile_sizes"]["tile_m"] = sorted_groups[0][0]
+            config["tile_sizes"]["tile_n"] = sorted_groups[0][1]
+            config["tile_sizes"]["tile_k"] = sorted_groups[0][2]
+
+            # Smallest dimensions = warp config
+            config["warp_config"]["warp_m"] = sorted_groups[2][0]
+            config["warp_config"]["warp_n"] = sorted_groups[2][1]
+            config["warp_config"]["warp_k"] = sorted_groups[2][2]
+
+            # Middle dimensions = warp tile
+            config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0]
+            config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1]
+            config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2]
+        elif len(dimension_groups) == 2:
+            # If only 2 groups, assign based on magnitude
+            sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True)
+
+            # Larger = tile sizes
+            config["tile_sizes"]["tile_m"] = sorted_groups[0][0]
+            config["tile_sizes"]["tile_n"] = sorted_groups[0][1]
+            config["tile_sizes"]["tile_k"] = sorted_groups[0][2]
+
+            # Smaller = warp config
+            config["warp_config"]["warp_m"] = sorted_groups[1][0]
+            config["warp_config"]["warp_n"] = sorted_groups[1][1]
+            config["warp_config"]["warp_k"] = sorted_groups[1][2]
+        elif len(dimension_groups) == 1:
+            # Only one group - assume it's tile sizes
+            config["tile_sizes"]["tile_m"] = dimension_groups[0][0]
+            config["tile_sizes"]["tile_n"] = dimension_groups[0][1]
+            config["tile_sizes"]["tile_k"] = dimension_groups[0][2]
+
+        return config
+
+    def generate_config_id(self, info: Dict) -> str:
+        """Generate a compact config ID from kernel info"""
+        # Create a compact identifier
+        parts = [
+            info.get("data_type", "unk"),
+            info.get("layout", "unk"),
+            info.get("pipeline", "unk"),
+            info.get("scheduler", "unk"),
+        ]
+
+        # Add tile configuration if available
+        tile_sizes = info.get("tile_sizes", {})
+        if tile_sizes.get("tile_m", 0) > 0:
+            tile_str = (
+                f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}"
+            )
+            parts.append(tile_str)
+
+        # Add warp config if available
+        warp_config = info.get("warp_config", {})
+        if warp_config.get("warp_m", 0) > 0:
+            warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}"
+            parts.append(warp_str)
+
+        # Add warp tile if available
+        warp_tile = info.get("warp_tile", {})
+        if warp_tile.get("warp_tile_m", 0) > 0:
+            warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}"
+            parts.append(warp_tile_str)
+
+        return "_".join(parts)
+
+    def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]:
+        """Run a single kernel with given parameters and save output to individual JSON file"""
+        # Create results directory
+        results_dir = self.build_dir / "results"
+        results_dir.mkdir(exist_ok=True)
+
+        # Generate unique JSON filename for this kernel
+        json_file = results_dir / f"{kernel_path.stem}.json"
+
+        cmd = [str(kernel_path)]
+
+        # Add parameters
+        for key, value in params.items():
+            cmd.append(f"-{key}={value}")
+
+        # Add JSON output flag for clean JSON output
+        cmd.append("-json_output=true")
+
+        if self.verbose:
+            print(f"Running: {' '.join(cmd)}")
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+
+            if result.returncode != 0:
+                print(f"Error running {kernel_path.name}: {result.stderr}")
+                return None
+
+            # Save raw output to individual JSON file
+            output = result.stdout.strip()
+
+            if output:
+                with open(json_file, "w") as f:
+                    f.write(output)
+
+                # Parse the JSON file
+                return self.parse_json_file(json_file)
+            else:
+                print(f"No output from {kernel_path.name}")
+                return None
+
+        except subprocess.TimeoutExpired:
+            print(f"Timeout running {kernel_path.name}")
+            return None
+        except Exception as e:
+            print(f"Error running {kernel_path.name}: {e}")
+            return None
+
+    def parse_json_file(self, json_file: Path) -> Optional[Dict]:
+        """Parse JSON data from individual kernel output file"""
+        try:
+            with open(json_file, "r") as f:
+                content = f.read().strip()
+
+            # Parse the JSON directly since executables produce clean JSON
+            data = json.loads(content)
+
+            # Return the complete JSON data as-is, just add some convenience fields
+            result = data.copy()
+            if "perf_result" in data:
+                perf = data["perf_result"]
+                # Add convenience fields for backward compatibility
+                result["time_ms"] = perf.get("latency(ms)", 0)
+                result["tflops"] = perf.get("tflops(TFlops)", 0)
+                result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0)
+
+            return result
+
+        except json.JSONDecodeError as e:
+            if self.verbose:
+                print(f"Failed to parse JSON from {json_file}: {e}")
+            return None
+        except Exception as e:
+            if self.verbose:
+                print(f"Error reading JSON file {json_file}: {e}")
+            return None
+
+    def benchmark_problem_size(
+        self,
+        kernels: List[Path],
+        m: int,
+        n: int,
+        k: int,
+        split_k: int = 1,
+        verify: int = 0,
+        warmup: int = 50,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> List[Dict]:
+        """Benchmark all kernels for a specific problem size"""
+        results = []
+
+        params = {
+            "m": m,
+            "n": n,
+            "k": k,
+            "split_k": split_k,
+            "verify": verify,
+            "warmup": warmup,
+            "repeat": repeat,
+            "flush_cache": str(flush_cache).lower(),
+            "rotating_count": rotating_count,
+        }
+
+        print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}")
+
+        for kernel_path in kernels:
+            kernel_info = self.extract_kernel_info(kernel_path)
+            result = self.run_kernel(kernel_path, params)
+
+            if result:
+                # Create new structured result format
+                structured_result = {
+                    "name": kernel_info["name"],  # Add name field for compatibility
+                    "config_id": kernel_info["config_id"],
+                    "problem": result.get("problem", {}),
+                    "perf_result": result.get("perf_result", {}),
+                    "config": {
+                        "data_type": kernel_info["data_type"],
+                        "layout": kernel_info["layout"],
+                        "pipeline": kernel_info["pipeline"],
+                        "scheduler": kernel_info["scheduler"],
+                        "epilogue": kernel_info["epilogue"],
+                        "tile_sizes": kernel_info.get("tile_sizes", {}),
+                        "warp_config": kernel_info.get("warp_config", {}),
+                        "warp_tile": kernel_info.get("warp_tile", {}),
+                        "optimization_flags": kernel_info.get("optimization_flags", {}),
+                    },
+                    "executable": kernel_info["executable"],
+                    # Keep backward compatibility fields
+                    "time_ms": result.get("time_ms", 0),
+                    "tflops": result.get("tflops", 0),
+                    "bandwidth_gb_s": result.get("bandwidth_gb_s", 0),
+                }
+
+                results.append(structured_result)
+
+                if self.verbose:
+                    print(
+                        f"  {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms"
+                    )
+
+        return results
+
+    def find_best_kernel(
+        self, results: List[Dict], metric: str = "tflops"
+    ) -> Optional[Dict]:
+        """Find the best performing kernel based on metric"""
+        if not results:
+            return None
+
+        if metric == "tflops":
+            return max(results, key=lambda x: x.get("tflops", 0))
+        elif metric == "time_ms":
+            return min(results, key=lambda x: x.get("time_ms", float("inf")))
+        elif metric == "bandwidth_gb_s":
+            return max(results, key=lambda x: x.get("bandwidth_gb_s", 0))
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+    def benchmark_sweep(
+        self,
+        problem_sizes: List[Tuple[int, int, int]],
+        split_k_values: List[int] = [1],
+        verify: bool = False,
+        warmup: int = 50,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> Dict:
+        """Run comprehensive benchmark sweep"""
+        kernels = self.discover_kernels()
+        if not kernels:
+            print("No kernels found!")
+            return {}
+
+        all_results = []
+        best_kernels = {}
+
+        for m, n, k in problem_sizes:
+            for split_k in split_k_values:
+                results = self.benchmark_problem_size(
+                    kernels,
+                    m,
+                    n,
+                    k,
+                    split_k,
+                    verify=2 if verify else 0,
+                    warmup=warmup,
+                    repeat=repeat,
+                    flush_cache=flush_cache,
+                    rotating_count=rotating_count,
+                )
+
+                all_results.extend(results)
+
+                # Find best kernel for this configuration
+                best = self.find_best_kernel(results)
+                if best:
+                    key = f"m{m}_n{n}_k{k}_splitk{split_k}"
+                    best_kernels[key] = best
+                    print(
+                        f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)"
+                    )
+
+        self.results = all_results
+        return best_kernels
+
+    def export_csv(self, filename: str):
+        """Export all results to CSV"""
+        if not self.results:
+            print("No results to export")
+            return
+
+        # Get all unique keys from results
+        all_keys = set()
+        for result in self.results:
+            all_keys.update(result.keys())
+
+        # Sort keys for consistent output
+        fieldnames = sorted(all_keys)
+
+        with open(filename, "w", newline="") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(self.results)
+
+        print(f"Results exported to {filename}")
+
+    def export_best_kernels(self, best_kernels: Dict, filename: str):
+        """Export best kernel selections to file"""
+        with open(filename, "w") as f:
+            f.write("# Best kernel selections\n")
+            f.write(
+                "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n"
+            )
+
+            for key, kernel in sorted(best_kernels.items()):
+                f.write(
+                    f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n"
+                )
+
+        print(f"Best kernels exported to {filename}")
+
+    def export_json(self, filename: str, best_kernels: Dict = None):
+        """Export all results and best kernels to JSON with comprehensive metadata"""
+        from datetime import datetime
+
+        # Calculate comprehensive summary statistics for all metrics
+        successful_results = [r for r in self.results if r.get("tflops", 0) > 0]
+
+        tflops_values = [r.get("tflops", 0) for r in successful_results]
+        bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
+        latency_values = [
+            r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
+        ]
+
+        # Performance breakdown by kernel type
+        pipeline_stats = {}
+        scheduler_stats = {}
+        data_type_stats = {}
+
+        for result in successful_results:
+            # Get config info from the new structure
+            config = result.get("config", {})
+
+            # Pipeline statistics
+            pipeline = config.get("pipeline", "unknown")
+            if pipeline not in pipeline_stats:
+                pipeline_stats[pipeline] = {
+                    "count": 0,
+                    "avg_tflops": 0,
+                    "best_tflops": 0,
+                }
+            pipeline_stats[pipeline]["count"] += 1
+            pipeline_stats[pipeline]["best_tflops"] = max(
+                pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0)
+            )
+
+            # Scheduler statistics
+            scheduler = config.get("scheduler", "unknown")
+            if scheduler not in scheduler_stats:
+                scheduler_stats[scheduler] = {
+                    "count": 0,
+                    "avg_tflops": 0,
+                    "best_tflops": 0,
+                }
+            scheduler_stats[scheduler]["count"] += 1
+            scheduler_stats[scheduler]["best_tflops"] = max(
+                scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0)
+            )
+
+            # Data type statistics
+            data_type = config.get("data_type", "unknown")
+            if data_type not in data_type_stats:
+                data_type_stats[data_type] = {
+                    "count": 0,
+                    "avg_tflops": 0,
+                    "best_tflops": 0,
+                }
+            data_type_stats[data_type]["count"] += 1
+            data_type_stats[data_type]["best_tflops"] = max(
+                data_type_stats[data_type]["best_tflops"], result.get("tflops", 0)
+            )
+
+        # Calculate averages for breakdown stats
+        for stats_dict, field_name in [
+            (pipeline_stats, "pipeline"),
+            (scheduler_stats, "scheduler"),
+            (data_type_stats, "data_type"),
+        ]:
+            for key in stats_dict:
+                relevant_results = [
+                    r
+                    for r in successful_results
+                    if r.get("config", {}).get(field_name, "unknown") == key
+                ]
+                if relevant_results:
+                    stats_dict[key]["avg_tflops"] = sum(
+                        r.get("tflops", 0) for r in relevant_results
+                    ) / len(relevant_results)
+
+        output_data = {
+            "benchmark_metadata": {
+                "timestamp": datetime.now().isoformat(),
+                "total_kernels_tested": len(self.results),
+                "unique_kernels": len(
+                    set(r.get("name", "unknown") for r in self.results)
+                ),
+                "successful_runs": len(successful_results),
+                "failed_runs": len(self.results) - len(successful_results),
+            },
+            "performance_summary": {
+                "tflops_stats": {
+                    "best": max(tflops_values, default=0),
+                    "average": sum(tflops_values) / len(tflops_values)
+                    if tflops_values
+                    else 0,
+                    "min": min(tflops_values, default=0),
+                    "median": sorted(tflops_values)[len(tflops_values) // 2]
+                    if tflops_values
+                    else 0,
+                },
+                "bandwidth_stats": {
+                    "best_gb_s": max(bandwidth_values, default=0),
+                    "average_gb_s": sum(bandwidth_values) / len(bandwidth_values)
+                    if bandwidth_values
+                    else 0,
+                    "min_gb_s": min(bandwidth_values, default=0),
+                    "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2]
+                    if bandwidth_values
+                    else 0,
+                },
+                "latency_stats": {
+                    "best_ms": min(latency_values, default=0),
+                    "average_ms": sum(latency_values) / len(latency_values)
+                    if latency_values
+                    else 0,
+                    "max_ms": max(latency_values, default=0),
+                    "median_ms": sorted(latency_values)[len(latency_values) // 2]
+                    if latency_values
+                    else 0,
+                },
+                "kernel_type_breakdown": {
+                    "by_pipeline": pipeline_stats,
+                    "by_scheduler": scheduler_stats,
+                    "by_data_type": data_type_stats,
+                },
+                "total_problem_configurations": len(best_kernels)
+                if best_kernels
+                else 0,
+            },
+            "kernel_results": self.results,
+            "best_kernels_by_problem": best_kernels or {},
+        }
+
+        with open(filename, "w") as f:
+            json.dump(output_data, f, indent=2)
+
+        print(f"JSON results exported to {filename}")
+        print(f"  - Total kernels: {len(self.results)}")
+        print(f"  - Successful runs: {len(successful_results)}")
+        print(f"  - Best TFLOPS: {max(tflops_values, default=0):.2f}")
+        print(f"  - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
+        print(f"  - Best latency: {min(latency_values, default=0):.2f}ms")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="GEMM Preshuffle Kernel Benchmarking Tool"
+    )
+    parser.add_argument(
+        "build_dir", help="Build directory containing kernel executables"
+    )
+    parser.add_argument(
+        "--problem-sizes",
+        nargs="+",
+        default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"],
+        help="Problem sizes as M,N,K tuples",
+    )
+    parser.add_argument(
+        "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test"
+    )
+    parser.add_argument("--verify", action="store_true", help="Enable verification")
+    parser.add_argument(
+        "--csv",
+        default="gemm_preshuffle_benchmark_results.csv",
+        help="CSV output filename",
+    )
+    parser.add_argument(
+        "--best", default="best_kernels.txt", help="Best kernels output filename"
+    )
+    parser.add_argument("--verbose", action="store_true", help="Verbose output")
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=50,
+        help="Number of warmup iterations (default: 50)",
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=100,
+        help="Number of benchmark iterations (default: 100)",
+    )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        default=True,
+        help="Enable cache flushing (default: True)",
+    )
+    parser.add_argument(
+        "--rotating-count",
+        type=int,
+        default=1000,
+        help="Number of iterations to rotate cache (default: 1000)",
+    )
+    parser.add_argument("--json", help="JSON output filename (optional)")
+
+    args = parser.parse_args()
+
+    # Parse problem sizes
+    problem_sizes = []
+    for size_str in args.problem_sizes:
+        try:
+            m, n, k = map(int, size_str.split(","))
+            problem_sizes.append((m, n, k))
+        except ValueError:
+            print(f"Invalid problem size: {size_str}")
+            return 1
+
+    # Create benchmark instance
+    benchmark = GemmPreshuffleBenchmark(args.build_dir, verbose=args.verbose)
+
+    # Run benchmark sweep
+    print("Starting GEMM Preshuffle kernel benchmark sweep...")
+    start_time = time.time()
+
+    best_kernels = benchmark.benchmark_sweep(
+        problem_sizes=problem_sizes,
+        split_k_values=args.split_k,
+        verify=args.verify,
+        warmup=args.warmup,
+        repeat=args.repeat,
+        flush_cache=args.flush_cache,
+        rotating_count=args.rotating_count,
+    )
+
+    elapsed_time = time.time() - start_time
+    print(f"\nBenchmark completed in {elapsed_time:.2f} seconds")
+
+    # Export results
+    benchmark.export_csv(args.csv)
+    benchmark.export_best_kernels(best_kernels, args.best)
+
+    # Export JSON if requested
+    if args.json:
+        benchmark.export_json(args.json, best_kernels)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp
new file mode 100644
index 0000000000..4fb98dc3c2
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
+
+//[TODO] This can be moved to commons
+// DataTypeTraits for all supported types
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    static constexpr const char* name = "pk_int4_t";
+};
+
+// Helper function to determine if a layout is row-major
+template <typename Layout>
+constexpr auto is_row_major(Layout)
+{
+    return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+// // Permutation function for pk_int4_t
+// template <typename Tensor>
+// void permute_vectors_i4x4_b(Tensor& tensor)
+// {
+//     const ck_tile::index_t K = tensor.get_length(0);
+//     const ck_tile::index_t N = tensor.get_length(1);
+//     // vector pk_i4x4 permute
+//     for(int i = 0; i < N; i++)
+//     {
+//         for(int j = 0; j < K; j += 8)
+//         {
+//             int8_t input[8];
+
+//             for(int k = 0; k < 4; k++)
+//             {
+//                 int8_t i4x2      = tensor(j + k * 2, i).data;
+//                 input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+//                 input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+//             }
+
+//             // permute 01234567->20643175
+//             {
+//                 int8_t hi        = input[2];
+//                 int8_t lo        = input[0];
+//                 int8_t i4x2      = (hi << 4) | lo;
+//                 tensor(j + 0, i) = i4x2;
+//             }
+
+//             {
+//                 int8_t hi        = input[6];
+//                 int8_t lo        = input[4];
+//                 int8_t i4x2      = (hi << 4) | lo;
+//                 tensor(j + 2, i) = i4x2;
+//             }
+
+//             {
+//                 int8_t hi        = input[3];
+//                 int8_t lo        = input[1];
+//                 int8_t i4x2      = (hi << 4) | lo;
+//                 tensor(j + 4, i) = i4x2;
+//             }
+
+//             {
+//                 int8_t hi        = input[7];
+//                 int8_t lo        = input[5];
+//                 int8_t i4x2      = (hi << 4) | lo;
+//                 tensor(j + 6, i) = i4x2;
+//             }
+//         }
+//     }
+// }
+
+// Structure to hold kernel traits for dispatcher
+struct KernelTraits
+{
+    std::string pipeline;  // preshufflev1, preshufflev2
+    std::string scheduler; // intrawave, interwave, default
+    std::string epilogue;  // cshuffle, default
+    bool pad_m;
+    bool pad_n;
+    bool pad_k;
+    bool persistent;
+
+    // Constructor with defaults
+    KernelTraits()
+        : pipeline("preshufflev2"),
+          scheduler("default"),
+          epilogue("default"),
+          pad_m(false),
+          pad_n(false),
+          pad_k(false),
+          persistent(false)
+    {
+    }
+};
+
+// Helper to extract traits from kernel name
+inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
+{
+    KernelTraits traits;
+
+    // Extract pipeline
+    if(kernel_name.find("preshufflev1") != std::string::npos)
+    {
+        traits.pipeline = "preshufflev1";
+    }
+    else if(kernel_name.find("preshufflev2") != std::string::npos)
+    {
+        traits.pipeline = "preshufflev2";
+    }
+
+    // Extract scheduler
+    if(kernel_name.find("interwave") != std::string::npos)
+    {
+        traits.scheduler = "interwave";
+    }
+    else if(kernel_name.find("intrawave") != std::string::npos)
+    {
+        traits.scheduler = "intrawave";
+    }
+    else
+    {
+        traits.scheduler = "default";
+    }
+
+    // Extract epilogue
+    if(kernel_name.find("default") != std::string::npos &&
+       kernel_name.find("default_") == std::string::npos)
+    {
+        traits.epilogue = "default";
+    }
+    else
+    {
+        traits.epilogue = "cshuffle";
+    }
+
+    // Padding flags would need to be extracted from the kernel configuration
+    // For now, we'll leave them as false
+
+    return traits;
+}
+
+template <typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t,
+               ck_tile::index_t N_Warp_Tile,
+               ck_tile::index_t K_Warp_Tile)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_      = t.get_lengths()[1];
+    int k_      = t.get_lengths()[0];
+    int divisor = N_Warp_Tile == 32 ? 2 : 4;
+    ck_tile::HostTensor<T> t_view(
+        {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+}
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py
new file mode 100644
index 0000000000..7734cb3a5e
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py
@@ -0,0 +1,842 @@
+import argparse
+import os
+import json
+import itertools
+import logging
+import multiprocessing
+import concurrent.futures
+
+from pathlib import Path
+
+from commons.validation_utils import (
+    is_tile_config_valid,
+    is_trait_combination_valid,
+    get_dtype_string,
+    get_abc_layouts,
+)
+
+
+class GemmPreshuffleKernelBuilder:
+    def __init__(self, working_path, datatype, layout, config_json=None):
+        self.working_path = Path(working_path)
+        self.datatype = datatype
+        self.layout = layout
+        self.config_json = config_json
+
+        # Create working directory if it doesn't exist
+        self.working_path.mkdir(parents=True, exist_ok=True)
+
+        # Load configuration
+        if config_json and os.path.exists(config_json):
+            with open(config_json, "r") as f:
+                self.config = json.load(f)
+
+    def write_kernel_list(self):
+        """Write kernel list to file for CMake to read (with comprehensive validation)"""
+        # Get configurations using comprehensive validation
+        tile_configs = self._get_tile_configs(fast_mode=False)
+        trait_combos = self._generate_trait_combinations()
+
+        kernel_list = []
+        for tile_config in tile_configs:
+            for trait_combo in trait_combos:
+                (
+                    pipeline,
+                    epilogue,
+                    scheduler,
+                    pad_m,
+                    pad_n,
+                    pad_k,
+                    persistent,
+                ) = trait_combo
+
+                # Create kernel name with proper boolean capitalization
+                kernel_name = f"gemm_preshuffle_{self.datatype}_{self.layout}_{pipeline}_{epilogue}_{scheduler}_{str(pad_m).capitalize()}_{str(pad_n).capitalize()}_{str(pad_k).capitalize()}_{str(persistent).capitalize()}"
+
+                # Create tile configuration string
+                tile_str = f"{tile_config['tile_m']}x{tile_config['tile_n']}x{tile_config['tile_k']}_"
+                tile_str += f"{tile_config['warp_m']}x{tile_config['warp_n']}x{tile_config['warp_k']}_"
+                tile_str += f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}x{tile_config['warp_tile_k']}"
+
+                kernel_name += f"_{tile_str}"
+
+                kernel_list.append(
+                    {
+                        "name": kernel_name,
+                        "tile_config": tile_config,
+                        "trait_combo": trait_combo,
+                    }
+                )
+
+        # Write kernel count
+        with open(self.working_path / "gemm_preshuffle_kernel_count.txt", "w") as f:
+            f.write(str(len(kernel_list)))
+
+        # Write kernel list
+        with open(self.working_path / "gemm_preshuffle_kernel_list.txt", "w") as f:
+            for kernel in kernel_list:
+                # Format: kernel_name|tile_config|trait_combo
+                tile_config = kernel["tile_config"]
+                trait_combo = kernel["trait_combo"]
+
+                tile_str = f"{tile_config['tile_m']}x{tile_config['tile_n']}x{tile_config['tile_k']}_"
+                tile_str += f"{tile_config['warp_m']}x{tile_config['warp_n']}x{tile_config['warp_k']}_"
+                tile_str += f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}x{tile_config['warp_tile_k']}"
+
+                trait_str = (
+                    f"{trait_combo[0]}_{trait_combo[1]}_{trait_combo[2]}_"
+                    + "_".join(str(x) for x in trait_combo[3:])
+                )
+
+                f.write(f"{kernel['name']}|{tile_str}|{trait_str}\n")
+
+        print(f"Listed {len(kernel_list)} kernel configurations")
+
+    def _get_tile_configs(self, fast_mode=False):
+        """Get tile configurations for the current datatype and layout"""
+        if "tile_configs" in self.config:
+            # Old format
+            return (
+                self.config["tile_configs"].get(self.datatype, {}).get(self.layout, [])
+            )
+        elif "tile_config" in self.config:
+            # New format - generate combinations from individual parameter values
+            tile_config = self.config["tile_config"]
+
+            # Get all possible values for each parameter
+            tile_m_values = tile_config.get("tile_m", {}).get("values", [256])
+            tile_n_values = tile_config.get("tile_n", {}).get("values", [256])
+            tile_k_values = tile_config.get("tile_k", {}).get("values", [32])
+            warp_m_values = tile_config.get("warp_m", {}).get("values", [2])
+            warp_n_values = tile_config.get("warp_n", {}).get("values", [2])
+            warp_k_values = tile_config.get("warp_k", {}).get("values", [1])
+            warp_tile_m_values = tile_config.get("warp_tile_m", {}).get("values", [32])
+            warp_tile_n_values = tile_config.get("warp_tile_n", {}).get("values", [32])
+            warp_tile_k_values = tile_config.get("warp_tile_k", {}).get("values", [32])
+
+            # Generate all combinations
+            configs = []
+            for tile_m in tile_m_values:
+                for tile_n in tile_n_values:
+                    for tile_k in tile_k_values:
+                        for warp_m in warp_m_values:
+                            for warp_n in warp_n_values:
+                                for warp_k in warp_k_values:
+                                    for warp_tile_m in warp_tile_m_values:
+                                        for warp_tile_n in warp_tile_n_values:
+                                            for warp_tile_k in warp_tile_k_values:
+                                                # Validate configuration
+                                                if self._validate_tile_config(
+                                                    tile_m,
+                                                    tile_n,
+                                                    tile_k,
+                                                    warp_m,
+                                                    warp_n,
+                                                    warp_k,
+                                                    warp_tile_m,
+                                                    warp_tile_n,
+                                                    warp_tile_k,
+                                                    fast_mode=fast_mode,
+                                                ):
+                                                    configs.append(
+                                                        {
+                                                            "tile_m": tile_m,
+                                                            "tile_n": tile_n,
+                                                            "tile_k": tile_k,
+                                                            "warp_m": warp_m,
+                                                            "warp_n": warp_n,
+                                                            "warp_k": warp_k,
+                                                            "warp_tile_m": warp_tile_m,
+                                                            "warp_tile_n": warp_tile_n,
+                                                            "warp_tile_k": warp_tile_k,
+                                                        }
+                                                    )
+            return configs
+        else:
+            # Fallback to default
+            return []
+
+    def _generate_trait_combinations(self):
+        """Generate all combinations of traits"""
+        if "traits" in self.config:
+            # Old format
+            traits = self.config["traits"]
+            pipelines = traits["pipelines"]
+            epilogues = traits["epilogues"]
+            schedulers = traits["schedulers"]
+
+            padding = self.config["padding"]
+            persistent = self.config["persistent"]
+
+            all_combinations = list(
+                itertools.product(
+                    pipelines,
+                    epilogues,
+                    schedulers,
+                    padding["pad_m"],
+                    padding["pad_n"],
+                    padding["pad_k"],
+                    persistent,
+                )
+            )
+
+            # Filter out unsupported trait combinations
+            combinations = []
+            for combo in all_combinations:
+                pipeline, epilogue, scheduler = combo[:3]
+                if is_trait_combination_valid(pipeline, epilogue, scheduler):
+                    combinations.append(combo)
+                else:
+                    logging.debug(
+                        f"Skipping unsupported trait combination: {pipeline}-{epilogue}-{scheduler}"
+                    )
+
+        elif "trait_config" in self.config:
+            # New format
+            trait_config = self.config["trait_config"]
+
+            pipelines = trait_config.get("pipeline", {}).get("values", ["preshufflev2"])
+            epilogues = trait_config.get("epilogue", {}).get("values", ["default"])
+            schedulers = trait_config.get("scheduler", {}).get("values", ["default"])
+            pad_m_values = trait_config.get("pad_m", {}).get("values", [False])
+            pad_n_values = trait_config.get("pad_n", {}).get("values", [False])
+            pad_k_values = trait_config.get("pad_k", {}).get("values", [False])
+            persistent_values = trait_config.get("persistent", {}).get(
+                "values", [False]
+            )
+
+            all_combinations = list(
+                itertools.product(
+                    pipelines,
+                    epilogues,
+                    schedulers,
+                    pad_m_values,
+                    pad_n_values,
+                    pad_k_values,
+                    persistent_values,
+                )
+            )
+
+            # Filter out unsupported trait combinations
+            combinations = []
+            for combo in all_combinations:
+                pipeline, epilogue, scheduler = combo[:3]
+                if is_trait_combination_valid(pipeline, epilogue, scheduler):
+                    combinations.append(combo)
+                else:
+                    logging.debug(
+                        f"Skipping unsupported trait combination: {pipeline}-{epilogue}-{scheduler}"
+                    )
+        else:
+            # Fallback to minimal default
+            combinations = [
+                ("preshufflev2", "default", "default", False, False, False, False)
+            ]
+
+        return combinations
+
+    def _validate_tile_config(
+        self,
+        tile_m,
+        tile_n,
+        tile_k,
+        warp_m,
+        warp_n,
+        warp_k,
+        warp_tile_m,
+        warp_tile_n,
+        warp_tile_k,
+        pipeline="preshufflev2",  # Default pipeline for validation
+        fast_mode=False,  # Add fast mode option
+    ):
+        """Validate that tile configuration is reasonable"""
+        if fast_mode:
+            # Fast validation for listing - only basic sanity checks
+            if tile_m <= 0 or tile_n <= 0 or tile_k <= 0:
+                return False
+            if warp_m <= 0 or warp_n <= 0 or warp_k <= 0:
+                return False
+            if warp_tile_m <= 0 or warp_tile_n <= 0 or warp_tile_k <= 0:
+                return False
+
+            # Basic divisibility check
+            if tile_m % (warp_m * warp_tile_m) != 0:
+                return False
+            if tile_n % (warp_n * warp_tile_n) != 0:
+                return False
+            if tile_k % (warp_k * warp_tile_k) != 0:
+                return False
+
+            return True
+        else:
+            # Full validation for generation
+            # Determine data types for validation
+            a_datatype = self.datatype
+            b_datatype = self.datatype
+            c_datatype = self.datatype
+
+            # Special handling for certain data types
+            if self.datatype in ["fp8", "bf8"]:
+                c_datatype = "fp16"
+
+            # Use the comprehensive validation function
+            return is_tile_config_valid(
+                tile_m,
+                tile_n,
+                tile_k,
+                warp_m,
+                warp_n,
+                warp_k,
+                warp_tile_m,
+                warp_tile_n,
+                warp_tile_k,
+                a_datatype,
+                b_datatype,
+                c_datatype,
+                pipeline,
+            )
+
+    def _generate_kernel_instance(
+        self, tile_config, trait_combo, k_block_per_cu, is_header=True
+    ):
+        """Generate a single kernel instance"""
+        (
+            pipeline,
+            epilogue,
+            scheduler,
+            pad_m,
+            pad_n,
+            pad_k,
+            persistent,
+        ) = trait_combo
+
+        # Create kernel name with proper boolean capitalization
+        kernel_name = f"gemm_preshuffle_{self.datatype}_{self.layout}_{pipeline}_{epilogue}_{scheduler}_{str(pad_m).capitalize()}_{str(pad_n).capitalize()}_{str(pad_k).capitalize()}_{str(persistent).capitalize()}"
+
+        # Create tile configuration string
+        tile_str = (
+            f"{tile_config['tile_m']}x{tile_config['tile_n']}x{tile_config['tile_k']}_"
+        )
+        tile_str += (
+            f"{tile_config['warp_m']}x{tile_config['warp_n']}x{tile_config['warp_k']}_"
+        )
+        tile_str += f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}x{tile_config['warp_tile_k']}"
+
+        kernel_name += f"_{tile_str}"
+
+        # Map pipeline names to the correct pipeline implementation
+        pipeline_impl_map = {
+            "preshufflev1": "ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV1",
+            "preshufflev2": "ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2",
+        }
+
+        # Map pipeline names to base pipeline for hot loop detection
+        base_pipeline_map = {
+            "preshufflev1": "ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV1",
+            "preshufflev2": "ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2",
+        }
+
+        # Map scheduler names to the correct enum values
+        scheduler_type_map = {
+            "intrawave": "ck_tile::GemmPipelineScheduler::Intrawave",
+            "interwave": "ck_tile::GemmPipelineScheduler::Interwave",
+            "default": "ck_tile::GemmPipelineScheduler::Default",
+        }
+
+        # Determine accumulator type based on datatype
+        acc_type = "float"
+
+        # Determine output type
+        c_type = get_dtype_string(self.datatype)
+        if self.datatype in ["fp8", "bf8"]:
+            c_type = "ck_tile::fp16_t"
+
+        # Determine layouts based on self.layout
+        a_layout, b_layout, c_layout = get_abc_layouts(self.layout)
+
+        # Generate kernel instance code using the correct API
+        pragma_line = "#pragma once\n" if is_header else ""
+        instance_code = f"""// Generated kernel instance for {kernel_name}
+{pragma_line}
+#include <cstdint>
+#include <utility>
+#include <tuple>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
+#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
+
+using ADataType = {get_dtype_string(self.datatype)};
+using BDataType = {get_dtype_string(self.datatype)};
+using AccDataType = {acc_type};
+using CDataType = {c_type};
+
+using ALayout = {a_layout};
+using BLayout = {b_layout};
+using CLayout = {c_layout};
+
+// Kernel name for display
+constexpr const char* KERNEL_NAME = "{kernel_name}";
+
+// Wrapper for simplified launch interface
+struct SelectedKernel {{
+    // Tile configuration
+    static constexpr ck_tile::index_t BlockSize = 256;
+    static constexpr ck_tile::index_t TileM = {tile_config["tile_m"]};
+    static constexpr ck_tile::index_t TileN = {tile_config["tile_n"]};
+    static constexpr ck_tile::index_t TileK = {tile_config["tile_k"]};
+    static constexpr ck_tile::index_t WarpPerBlock_M = {tile_config["warp_m"]};
+    static constexpr ck_tile::index_t WarpPerBlock_N = {tile_config["warp_n"]};
+    static constexpr ck_tile::index_t WarpPerBlock_K = {tile_config["warp_k"]};
+    static constexpr ck_tile::index_t WarpTileM = {tile_config["warp_tile_m"]};
+    static constexpr ck_tile::index_t WarpTileN = {tile_config["warp_tile_n"]};
+    static constexpr ck_tile::index_t WarpTileK = {tile_config["warp_tile_k"]};
+
+    // Traits
+    static constexpr bool kPadM = {"true" if pad_m == "true" else "false"};
+    static constexpr bool kPadN = {"true" if pad_n == "true" else "false"};
+    static constexpr bool kPadK = {"true" if pad_k == "true" else "false"};
+    static constexpr bool TransposeC = false;
+    static constexpr bool UsePersistentKernel = {"true" if persistent == "true" else "false"};
+    static constexpr bool DoubleSmemBuffer = {"true" if pipeline == "preshufflev2" else "false"};
+    static constexpr bool UseStructuredSparsity = false;
+    static constexpr bool Preshuffle = true;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+
+    // Tile shape
+    using TileShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<TileM, TileN, TileK>,
+        ck_tile::sequence<WarpPerBlock_M, WarpPerBlock_N, WarpPerBlock_K>,
+        ck_tile::sequence<WarpTileM, WarpTileN, WarpTileK>,
+        false, false>;
+    
+    // Tile partitioner
+    using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<TileShape, 8, 4>;
+    
+    // Traits
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, NumWaveGroups>;
+    
+    // Pipeline problem
+    using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
+        ADataType,
+        BDataType,
+        AccDataType,
+        TileShape,
+        Traits>;
+    
+    // Base pipeline for hot loop detection
+    using BaseGemmPipeline = {base_pipeline_map.get(pipeline, "ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2")}<GemmPipelineProblem>;
+
+    static float launch(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{
+        const ck_tile::index_t k_grain = args.k_batch * TileK;
+        const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * TileK;
+        const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+        
+        float ave_time{{0}};
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {{
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v = tail_number_.value;
+            constexpr auto scheduler = {scheduler_type_map.get(scheduler, "ck_tile::GemmPipelineScheduler::Default")};
+            [[maybe_unused]] constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                ADataType,
+                BDataType,
+                AccDataType,
+                TileShape,
+                ck_tile::TileGemmUniversalTraits<kPadM, kPadN, kPadK, DoubleSmemBuffer,
+                                                ALayout, BLayout, CLayout, TransposeC,
+                                                UseStructuredSparsity, UsePersistentKernel,
+                                                NumWaveGroups, Preshuffle>,
+                scheduler,
+                has_hot_loop_v,
+                tail_number_v>;
+            
+            using GemmPipeline = {pipeline_impl_map.get(pipeline, "ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2")}<UniversalGemmProblem>;
+            
+            // Epilogue
+"""
+
+        # Add epilogue configuration based on type
+        if epilogue == "cshuffle":
+            instance_code += """            using EpilogueProblem = ck_tile::CShuffleEpilogueProblem<
+                ADataType,
+                BDataType,
+                ck_tile::tuple<>,  // DsDataType
+                AccDataType,
+                CDataType,
+                ck_tile::tuple<>,  // DsLayout
+                CLayout,
+                ck_tile::element_wise::PassThrough,
+                TilePartitioner::MPerBlock,  // kM_
+                TilePartitioner::NPerBlock,  // kN_
+                WarpPerBlock_M,              // MWave_
+                WarpPerBlock_N,              // NWave_
+                WarpTileM,                   // MPerXdl_
+                WarpTileN,                   // NPerXdl_
+                WarpTileK,                   // KPerXdl_
+                TransposeC,                  // isCTransposed_
+                memory_operation,            // MemoryOperation_
+                NumWaveGroups>;              // kNumWaveGroups_
+            
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<EpilogueProblem>;
+"""
+        else:  # default epilogue
+            instance_code += """            using EpilogueProblem = ck_tile::DefaultGemm2DEpilogueProblem<
+                ADataType,
+                BDataType,
+                ck_tile::tuple<>,  // DsDataType
+                AccDataType,
+                CDataType,
+                ck_tile::tuple<>,  // DsLayout
+                CLayout,
+                ck_tile::element_wise::PassThrough,
+                TilePartitioner::MPerBlock,  // kM_
+                TilePartitioner::NPerBlock,  // kN_
+                kPadM,
+                kPadN,
+                WarpTileM,  // kMPerXdl_
+                WarpTileN,  // kNPerXdl_
+                WarpTileK,  // kKPerXdl_
+                TransposeC>;  // isCTransposed_
+            
+            using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue<EpilogueProblem>;
+"""
+
+        instance_code += f"""
+            
+            // Kernel type
+            using GemmKernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            
+            // Make kernel arguments
+            auto kargs = GemmKernel::MakeKernelArgs(args);
+            
+            if (!GemmKernel::IsSupportedArgument(kargs)) {{
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
+            }}
+            
+            // Get grid and block sizes
+            const dim3 grids = {"GemmKernel::MaxOccupancyGridSize(stream)" if persistent == "true" else "GemmKernel::GridSize(args.M, args.N, args.k_batch)"};
+            const dim3 blocks = GemmKernel::BlockSize();
+            
+            if(stream.log_level_ > 0) {{
+                std::cout << "Launching kernel with args: " << GemmKernel::GetName() << '\\n'
+                          << "grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}"
+                          << ", blocks: {{" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}}"
+                          << std::endl;
+            }}
+            
+            // Launch kernel
+            constexpr int kBlockPerCu = {k_block_per_cu};
+            ave_time = ck_tile::launch_kernel(
+                stream,
+                ck_tile::make_kernel<kBlockPerCu>(GemmKernel{{}}, grids, blocks, 0, kargs));
+            
+            return ave_time;
+        }};
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {{
+            if(args.k_batch == 1) {{
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                            ck_tile::memory_operation_enum::set>{{}});
+            }} else {{
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                            ck_tile::memory_operation_enum::atomic_add>{{}});
+            }}
+        }};
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        return ave_time;
+    }}
+}};
+"""
+
+        return kernel_name, instance_code
+
+    def run(self, num_workers=None):
+        """Run the builder to generate individual kernel files"""
+        # Generate individual kernel files
+        self.generate_individual(num_workers)
+
+    def generate_individual(self, num_workers=None):
+        """Generate individual kernel files for separate compilation with parallel processing"""
+        if num_workers is None:
+            num_workers = min(
+                multiprocessing.cpu_count(), 8
+            )  # Limit to avoid memory issues
+
+        tile_configs = self._get_tile_configs()
+        trait_combos = self._generate_trait_combinations()
+        k_block_per_cu = self.config.get("k_block_per_cu")
+
+        # Prepare work items for parallel processing
+        work_items = []
+        for tile_config in tile_configs:
+            for trait_combo in trait_combos:
+                work_items.append(
+                    (
+                        tile_config,
+                        trait_combo,
+                        k_block_per_cu,
+                        self.working_path,
+                        self.datatype,
+                        self.layout,
+                    )
+                )
+
+        print(
+            f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
+        )
+        print(f"  Tile configs: {len(tile_configs)}")
+        print(f"  Trait combinations: {len(trait_combos)}")
+        print(f"  Total kernels: {len(work_items)}")
+
+        # Show first few work items for debugging
+        if work_items:
+            print("  First work item example:")
+            tile_config, trait_combo = work_items[0][:2]
+            print(f"    Tile config: {tile_config}")
+            print(f"    Trait combo: {trait_combo[:3]}")  # Show first 3 traits
+
+        # Process work items in parallel
+        kernel_list = []
+        completed = 0
+
+        with concurrent.futures.ProcessPoolExecutor(
+            max_workers=num_workers
+        ) as executor:
+            # Submit all work items
+            print(f"  Submitting {len(work_items)} tasks to executor...")
+            future_to_item = {
+                executor.submit(_generate_single_kernel_individual, item): item
+                for item in work_items
+            }
+            print("  All tasks submitted, waiting for completion...")
+
+            # Collect results with progress reporting
+            for future in concurrent.futures.as_completed(future_to_item):
+                completed += 1
+                if completed % 100 == 0 or completed == len(work_items):
+                    print(
+                        f"  Progress: {completed}/{len(work_items)} kernels generated"
+                    )
+
+                try:
+                    result = future.result()
+                    if result:
+                        kernel_list.append(result)
+                except Exception as exc:
+                    item = future_to_item[future]
+                    print(f"Kernel generation failed for {item}: {exc}")
+
+        # Sort kernel list for consistent ordering
+        kernel_list.sort(key=lambda x: x[0])  # Sort by kernel name
+
+        # Generate CMake include file for individual targets
+        self._generate_cmake_individual_targets(kernel_list)
+
+        print(
+            f"Generated {len(kernel_list)} individual kernel files in {self.working_path}"
+        )
+
+    def _generate_cmake_individual_targets(self, kernel_list):
+        """Generate CMake include file that creates individual targets"""
+        cmake_code = f"""# Generated CMake file for individual GEMM Preshuffle targets
+# Datatype: {self.datatype}, Layout: {self.layout}
+
+"""
+
+        for kernel_name, trait_combo, tile_config in kernel_list:
+            pipeline, epilogue, scheduler = trait_combo[:3]
+
+            # Format tile config for CMake function
+            tile_str = f"{tile_config['tile_m']}x{tile_config['tile_n']}x{tile_config['tile_k']}_"
+            tile_str += f"{tile_config['warp_m']}x{tile_config['warp_n']}x{tile_config['warp_k']}_"
+            tile_str += f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}x{tile_config['warp_tile_k']}"
+
+            trait_str = f"{pipeline}_{epilogue}_{scheduler}_" + "_".join(
+                str(x) for x in trait_combo[3:]
+            )
+
+            cmake_code += f'create_individual_gemm_preshuffle_target("{self.datatype}" "{self.layout}" "{trait_str}" "{tile_str}")\n'
+
+        # Write CMake include file
+        with open(
+            self.working_path / "gemm_preshuffle_individual_targets.cmake", "w"
+        ) as f:
+            f.write(cmake_code)
+
+
+def _generate_single_kernel_individual(work_item):
+    """Worker function to generate a single individual kernel file"""
+    tile_config, trait_combo, k_block_per_cu, working_path, datatype, layout = work_item
+
+    # Create a temporary builder instance for this worker
+    builder = GemmPreshuffleKernelBuilder(working_path, datatype, layout)
+
+    try:
+        kernel_name, instance_code = builder._generate_kernel_instance(
+            tile_config, trait_combo, k_block_per_cu
+        )
+
+        # Create simplified filename without the "gemm_" prefix
+        # Remove "gemm_" from the beginning of kernel_name for the filename
+        simplified_name = kernel_name
+        if simplified_name.startswith("gemm_"):
+            simplified_name = simplified_name[5:]  # Remove "gemm_" prefix
+
+        # Write individual header file
+        header_file = working_path / f"gemm_single_{simplified_name}.hpp"
+        with open(header_file, "w") as f:
+            f.write(instance_code)
+
+        return (kernel_name, trait_combo, tile_config)
+    except Exception as e:
+        print(f"Error generating individual kernel: {e}")
+        return None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="GEMM kernel instance builder with parallel support"
+    )
+    parser.add_argument("--working_path", required=True, help="Working directory path")
+    parser.add_argument(
+        "--datatype",
+        required=True,
+        choices=["fp16", "fp8", "bf16", "bf8"],
+        help="Data type",
+    )
+    parser.add_argument(
+        "--layout",
+        required=True,
+        choices=["rcr", "rrr", "ccr", "crr"],
+        help="Matrix layout",
+    )
+    parser.add_argument("--config_json", required=True, help="Configuration JSON file")
+    parser.add_argument(
+        "--num_workers", type=int, help="Number of parallel workers (default: auto)"
+    )
+    parser.add_argument(
+        "--gen_individual", action="store_true", help="Generate individual kernel files"
+    )
+    parser.add_argument(
+        "--gen_single", action="store_true", help="Generate a single kernel file"
+    )
+    parser.add_argument("--kernel_name", help="Kernel name for single generation")
+    parser.add_argument(
+        "--tile_config", help="Tile configuration string for single generation"
+    )
+    parser.add_argument(
+        "--trait_combo", help="Trait combination string for single generation"
+    )
+    parser.add_argument(
+        "--list_kernels",
+        action="store_true",
+        help="List kernel configurations without generating files",
+    )
+
+    args = parser.parse_args()
+
+    assert args.datatype in ["fp16", "bf16", "fp8", "bf8"], (
+        f"Invalid datatype string: {args.datatype} (supported datatypes are [fp16, bf16, fp8, and bf8])"
+    )
+
+    layout_parts = args.layout.lower()
+    assert len(layout_parts) == 3, (
+        f"Invalid layout string: {args.layout} (must be 3 characters like 'rcr' where r stands for row major and c stands for column major)"
+    )
+    assert layout_parts[0] == "r" and layout_parts[1] == "c", (
+        f"Invalid matrix_a layout : {layout_parts[0]} or matrix_b layout: {layout_parts[1]} (matrix_a must be 'r' for row major and matrix_b must be 'c' for column major as it is the only supported layout for preshuffle)"
+    )
+    assert layout_parts[2] == "r", (
+        f"Invalid matrix_c layout: {layout_parts[2]} (must be 'r' only as currently we are supporting only row major)"
+    )
+
+    # Create builder
+    builder = GemmPreshuffleKernelBuilder(
+        args.working_path, args.datatype, args.layout, args.config_json
+    )
+
+    if args.list_kernels:
+        # Fast listing mode - just write kernel list without generating files
+        builder.write_kernel_list()
+        pass
+    elif args.gen_single:
+        # Generate a single kernel file
+        if not args.kernel_name or not args.tile_config or not args.trait_combo:
+            parser.error(
+                "--gen_single requires --kernel_name, --tile_config, and --trait_combo"
+            )
+        # Parse tile config
+        tile_parts = args.tile_config.split("_")
+        tile_dims = tile_parts[0].split("x")
+        warp_dims = tile_parts[1].split("x")
+        warp_tile_dims = tile_parts[2].split("x")
+
+        tile_config = {
+            "tile_m": int(tile_dims[0]),
+            "tile_n": int(tile_dims[1]),
+            "tile_k": int(tile_dims[2]),
+            "warp_m": int(warp_dims[0]),
+            "warp_n": int(warp_dims[1]),
+            "warp_k": int(warp_dims[2]),
+            "warp_tile_m": int(warp_tile_dims[0]),
+            "warp_tile_n": int(warp_tile_dims[1]),
+            "warp_tile_k": int(warp_tile_dims[2]),
+        }
+
+        # Parse trait combo
+        trait_parts = args.trait_combo.split("_")
+        trait_combo = (
+            trait_parts[0],  # pipeline
+            trait_parts[1],  # epilogue
+            trait_parts[2],  # scheduler
+            trait_parts[3] == "True",  # pad_m
+            trait_parts[4] == "True",  # pad_n
+            trait_parts[5] == "True",  # pad_k
+            trait_parts[6] == "True",  # persistent
+        )
+
+        k_block_per_cu = builder.config.get("k_block_per_cu")
+
+        # Generate the kernel
+        kernel_name, instance_code = builder._generate_kernel_instance(
+            tile_config, trait_combo, k_block_per_cu
+        )
+
+        # Write the file
+        simplified_name = kernel_name
+        if simplified_name.startswith("gemm_preshuffle_"):
+            simplified_name = simplified_name[16:]
+
+        header_file = (
+            builder.working_path / f"gemm_preshuffle_single_{simplified_name}.hpp"
+        )
+        with open(header_file, "w") as f:
+            f.write(instance_code)
+
+        print(f"Generated {header_file}")
+
+    elif args.gen_individual:
+        # Generate all individual kernel files
+        builder.run(args.num_workers)
+        pass
+    else:
+        parser.error(
+            "Must specify one of: --list_kernels, --gen_individual, or --gen_single"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp
new file mode 100644
index 0000000000..4f2a929ba0
--- /dev/null
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp
@@ -0,0 +1,275 @@
+#pragma once
+
+#include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "benchmark_gemm_preshuffle.hpp"
+
+class GemmProfiler
+{
+    public:
+    static GemmProfiler& instance(Setting setting)
+    {
+        static GemmProfiler instance{setting};
+        return instance;
+    }
+
+    // Overload for single kernel benchmarking
+    void benchmark(GemmProblem& gemm_problem,
+                   std::function<float(const ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>
+                       kernel_func,
+                   const std::tuple<int, int, int>& warp_tile_dims)
+    {
+        // Create a vector with a single callable that returns both name and time
+        std::vector<std::function<std::tuple<std::string, float>(ck_tile::GemmHostArgs&,
+                                                                 const ck_tile::stream_config&)>>
+            callables;
+
+        callables.push_back(
+            [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {
+                float time = kernel_func(args, stream);
+                return std::make_tuple(std::string(KERNEL_NAME), time);
+            });
+
+        benchmark(gemm_problem, callables, warp_tile_dims);
+    }
+
+    void benchmark(GemmProblem& gemm_problem,
+                   std::vector<std::function<std::tuple<std::string, float>(
+                       ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables,
+                   const std::tuple<int, int, int>& warp_tile_dims)
+    {
+        const ALayout layout_a = ALayout{};
+        const BLayout layout_b = BLayout{};
+        const CLayout layout_c = CLayout{};
+
+        gemm_problem.stride_a_ = ck_tile::get_default_stride(
+            gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a));
+        gemm_problem.stride_b_ = ck_tile::get_default_stride(
+            gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b));
+        gemm_problem.stride_c_ = ck_tile::get_default_stride(
+            gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c));
+
+        ck_tile::HostTensor<ADataType> a_m_k(ck_tile::host_tensor_descriptor(
+            gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)));
+        ck_tile::HostTensor<BDataType> b_k_n(ck_tile::host_tensor_descriptor(
+            gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)));
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(ck_tile::host_tensor_descriptor(
+            gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)));
+
+        if(setting_.init_method_ == 0)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_k_n);
+        }
+        else if(setting_.init_method_ == 1)
+        {
+            ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+            ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+        }
+        else if(setting_.init_method_ == 2)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+        }
+        else
+        {
+            a_m_k.SetZero();
+            b_k_n.SetZero();
+        }
+
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+        // Reference Verification
+        ck_tile::HostTensor<CDataType> c_m_n_ref(ck_tile::host_tensor_descriptor(
+            gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)));
+        c_m_n_ref.SetZero();
+
+        if(setting_.verify_)
+        {
+            gemm_host_reference(setting_.verify_,
+                                a_m_k,
+                                b_k_n,
+                                c_m_n_ref,
+                                a_m_k_dev_buf,
+                                b_k_n_dev_buf,
+                                gemm_problem.m_,
+                                gemm_problem.n_,
+                                gemm_problem.k_,
+                                gemm_problem.stride_a_,
+                                gemm_problem.stride_b_,
+                                gemm_problem.stride_c_);
+        }
+
+        // Kerenl Execution
+
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+
+        for(const auto& callable : callables)
+        {
+            ck_tile::index_t N_Warp_Tile = std::get<1>(warp_tile_dims);
+            ck_tile::index_t K_Warp_Tile = std::get<2>(warp_tile_dims);
+
+            ck_tile::HostTensor<BDataType> b_shuffle_host =
+                shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile);
+            b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+
+            ck_tile::GemmHostArgs gemm_args = {
+                a_m_k_dev_buf.GetDeviceBuffer(),
+                b_k_n_dev_buf.GetDeviceBuffer(),
+                c_m_n_dev_buf.GetDeviceBuffer(),
+                gemm_problem.split_k_,
+                gemm_problem.m_,
+                gemm_problem.n_,
+                gemm_problem.k_,
+                gemm_problem.stride_a_,
+                gemm_problem.stride_b_,
+                gemm_problem.stride_c_,
+            };
+
+            auto kernel_run_result = callable(gemm_args,
+                                              ck_tile::stream_config{nullptr,
+                                                                     true,
+                                                                     setting_.log_,
+                                                                     setting_.n_warmup_,
+                                                                     setting_.n_repeat_,
+                                                                     setting_.is_gpu_timer_,
+                                                                     setting_.flush_cache_,
+                                                                     setting_.rotating_count_});
+
+            process_result(
+                gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result);
+        }
+    }
+
+    void process_result(const GemmProblem& gemm_problem,
+                        ck_tile::DeviceMem& c_m_n_dev_buf,
+                        ck_tile::HostTensor<CDataType>& c_m_n_ref,
+                        ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                        const std::tuple<std::string, float>& kernel_run_result)
+    {
+        auto [name, avg_time] = kernel_run_result;
+
+        KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}};
+
+        // compute performance metric
+        std::size_t flop     = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_;
+        std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ +
+                               sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ +
+                               sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_;
+
+        // update
+        kernel_instance.perf_result_.latency_   = avg_time;
+        kernel_instance.perf_result_.tflops_    = static_cast<float>(flop) / 1.E9 / avg_time;
+        kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
+
+        if(setting_.log_ > 0 && !setting_.json_output_)
+        {
+            std::cout << kernel_instance << std::endl;
+        }
+
+        // verify result
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+        bool verified_correct =
+            !setting_.verify_ ||
+            compare(name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_ref);
+
+        if(verified_correct)
+        {
+            kernel_instances_.emplace_back(kernel_instance);
+        }
+        else
+        {
+            std::cout << "Verification failed, skip kernel: " << name << std::endl;
+        }
+
+        // clear tensor
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+    }
+
+    KernelInstance select_best_instance(Metric metric)
+    {
+        if(kernel_instances_.empty())
+            throw std::runtime_error("Empty instances");
+
+        auto kernel_instance = *std::max_element(kernel_instances_.begin(),
+                                                 kernel_instances_.end(),
+                                                 [metric](const auto& a, const auto& b) {
+                                                     return PerformanceResult::compare(
+                                                         b.perf_result_, a.perf_result_, metric);
+                                                 });
+
+        if(setting_.json_output_)
+        {
+            // Output clean JSON only
+            std::cout << kernel_instance << std::endl;
+        }
+        else
+        {
+            std::cout << "**********************************" << std::endl;
+            std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
+                      << "Current kernel performance is: " << kernel_instance << std::endl;
+            std::cout << "**********************************" << std::endl;
+        }
+
+        if(!setting_.csv_filename_.empty())
+        {
+            std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
+
+            if(!file.is_open())
+            {
+                std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
+            }
+            else
+            {
+                if(file.tellp() == 0)
+                {
+                    file << "rocm_version,device_name,"
+                         << "split_k,m,n,k,stride_a,stride_b,stride_c,"
+                         << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c,"
+                         << "structured_sparsity," << "name,"
+                         << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
+                }
+
+                const auto& problem = kernel_instance.problem_;
+                const auto& name    = kernel_instance.name_;
+                const auto& perf    = kernel_instance.perf_result_;
+
+                file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
+                     << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << ","
+                     << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << ","
+                     << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_
+                     << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << ","
+                     << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_
+                     << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed
+                     << std::setprecision(4) << perf.latency_ << "," << std::fixed
+                     << std::setprecision(4) << perf.tflops_ << "," << std::fixed
+                     << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
+                     << "\n";
+
+                if(!file)
+                {
+                    std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
+                }
+            }
+        }
+
+        return kernel_instance;
+    }
+
+    GemmProfiler(const GemmProfiler&)            = delete;
+    GemmProfiler& operator=(const GemmProfiler&) = delete;
+
+    private:
+    ~GemmProfiler() { kernel_instances_.clear(); }
+    GemmProfiler(Setting setting) : setting_(setting) {}
+
+    Setting setting_;
+
+    std::vector<KernelInstance> kernel_instances_;
+};